diff --git a/.copyright.hook b/.copyright.hook
index dc1b096a0ad28db732b794fa856efed71917c5e8..09afff2072df3384a429d01d06188218ae6e85d1 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -9,7 +9,7 @@ import subprocess
 import platform
 
 COPYRIGHT = '''
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c7eb260aea8478f4833cb79253f4481e10b8685..3a21574b855bc6bc37fefe61de98d657e712cde7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@@ -156,6 +156,7 @@ include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
+include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..98356cd7613baff7f0cd66d1462068232b2b8500
--- /dev/null
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -0,0 +1,18 @@
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+RUN apt-get update && apt-get install -y python
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..11d00b8f85382aa720c169338c51333b730d44d5
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,76 @@
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Infomation
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Accelerate Rate
+
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different Pserver Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples/ sec
+
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fuild and v2 comes from the network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbos Logs
+
+Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ee8b0763b62fc011f40f6197e929a68b48a93e47
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1271e0cf399184134c06b3200ee1202c65cef0
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..12c8964066cbcfe8d2a44de2f51a3d12ea422fe2
--- /dev/null
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
new file mode 100644
index 0000000000000000000000000000000000000000..499e06ec42fc8f840137173628fa465e0541ba30
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -0,0 +1,277 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='CPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--local',
+    type=str2bool,
+    default=True,
+    help='Whether to run as local mode.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def main():
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+        args.device_id)
+    exe = fluid.Executor(place)
+
+    # test
+    def test(exe):
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            exe.run(inference_program,
+                    feed={"pixel": img_data,
+                          "label": y_data})
+
+        return accuracy.eval(exe)
+
+    def train_loop(exe, trainer_prog):
+        iters = 0
+        ts = time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            start_time = time.time()
+            num_samples = 0
+            accuracy.reset(exe)
+            with profiler.profiler("CPU", 'total') as prof:
+                for batch_id, data in enumerate(train_reader()):
+                    ts = time.time()
+                    img_data = np.array(
+                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                            "float32")
+                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                    y_data = y_data.reshape([-1, 1])
+
+                    loss, acc = exe.run(
+                        trainer_prog,
+                        feed={"pixel": img_data,
+                              "label": y_data},
+                        fetch_list=[avg_cost] + accuracy.metrics)
+                    iters += 1
+                    num_samples += len(data)
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                        % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is the accumulation of batches, but not the current batch.
+
+            pass_elapsed = time.time() - start_time
+            pass_train_acc = accuracy.eval(exe)
+            pass_test_acc = test(exe)
+            print(
+                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                   pass_test_acc))
+
+    if args.local:
+        # Parameter initialization
+        exe.run(fluid.default_startup_program())
+
+        # data reader
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                else paddle.dataset.flowers.train(),
+                buf_size=5120),
+            batch_size=args.batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            batch_size=args.batch_size)
+        train_loop(exe, fluid.default_main_program())
+    else:
+        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, "6174"]))
+        pserver_endpoints = ",".join(eplist)
+        print("pserver endpoints: ", pserver_endpoints)
+        trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
+        current_endpoint = os.getenv(
+            "POD_IP") + ":6174"  # current pserver endpoint
+        training_role = os.getenv(
+            "TRAINING_ROLE",
+            "TRAINER")  # get the training role: trainer/pserver
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            optimize_ops,
+            params_grads,
+            pservers=pserver_endpoints,
+            trainers=trainers)
+
+        if training_role == "PSERVER":
+            if not current_endpoint:
+                print("need env SERVER_ENDPOINT")
+                exit(1)
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            print("starting server side startup")
+            exe.run(pserver_startup)
+            print("starting parameter server...")
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            # Parameter initialization
+            exe.run(fluid.default_startup_program())
+
+            # data reader
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                    else paddle.dataset.flowers.train(),
+                    buf_size=5120),
+                batch_size=args.batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+                paddle.dataset.flowers.test(),
+                batch_size=args.batch_size)
+
+            trainer_prog = t.get_trainer_program()
+            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+            exe.run(fluid.default_startup_program())
+            train_loop(exe, trainer_prog)
+        else:
+            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    main()
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac6b3c33252e0a1f596f539fc090c5ada118e15
--- /dev/null
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -0,0 +1,154 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+    def conv_block(input, num_filter, groups, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+
+    fc_dim = 512
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+
+
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+
+
+def main():
+    global ts
+    paddle.init(use_gpu=False)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+    extra_layers = None
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar.train10(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        cifar.test10(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
+        if isinstance(event, paddle.event.EndPass):
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 137f11da7f2f1c46eebf6590d93402786ef543c9..dbc676bdac30e0d730206c17a1912d49d4f896eb 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -15,12 +15,13 @@
 include(ExternalProject)
 
 set(BOOST_PROJECT       "extern_boost")
-set(BOOST_VER           "1.66.0")
-set(BOOST_TAR           "boost_1_66_0")
-set(BOOST_URL           "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_VER           "1.41.0")
+set(BOOST_TAR           "boost_1_41_0")
+set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 
 include_directories(${BOOST_INCLUDE_DIR})
 
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index d49c8d601102cf865287c33349bff5eee6a90f6d..6a701e076c95372f903a09d35d4208ee73bd584c 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -28,9 +28,3 @@ endif()
 add_dependencies(eigen3 extern_eigen3)
 
 LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
-    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 60946304541a20809276c3e665d8524baf209006..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
 
 LIST(APPEND external_project_dependencies gflags)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
   IF(ANDROID)
     INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 382fbda3b5cfeba893f03871cf65498d20804f36..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
 
-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
   INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
   IF(ANDROID)
     INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 365a370a9cfb708379bcff18ae6aa0725d420ae1..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
         CACHE FILEPATH "protoc library." FORCE)
 
-    IF(WITH_C_API OR WITH_FLUID)
+    IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
             INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7cb4efa7bff7164464f1210a2b2188226c219ef6..5fa60df7b3f6698ceeee1e4f6d868a3d4bfc7a41 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
                     -DWITH_TORCH=OFF
                     -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                     -DBUILD_SHARED=ON
+                    -DBUILD_TESTS=OFF
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 585db019d521b1699baadfae31ef95b5059c71b4..1cb54ba2164fafbfce9f28a3e894ae5e78a9cd68 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -179,15 +179,24 @@ function(cc_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (cc_library_SRCS)
-    if (cc_library_SHARED OR cc_library_shared) # build *.so
+  if(cc_library_SRCS)
+    if(cc_library_SHARED OR cc_library_shared) # build *.so
       add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
     else()
       add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
     endif()
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      endif()
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     
     # cpplint code style
@@ -224,12 +233,18 @@ function(cc_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
 
@@ -457,12 +472,12 @@ endfunction()
 
 function(py_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7d53554358497762b1cd91c39bdd23c5807af2bc
--- /dev/null
+++ b/cmake/inference_lib.cmake
@@ -0,0 +1,90 @@
+# make package for paddle fluid shared and static library
+function(copy TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DSTS DEPS)
+    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+    if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
+    endif()
+    math(EXPR len "${copy_lib_SRCS_len} - 1")
+    
+    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+    foreach(index RANGE ${len})
+        list(GET copy_lib_SRCS ${index} src)
+        list(GET copy_lib_DSTS ${index} dst)
+        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
+        if(IS_DIRECTORY ${src})
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
+        else()
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
+        endif()
+    endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+IF(NOT PROTOBUF_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    copy(protobuf_lib
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto 
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPENDS paddle_fluid_shared
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+add_custom_target(inference_lib_dist DEPENDS 
+  inference_lib framework_lib memory_lib platform_lib string_lib
+  gflags_lib glog_lib protobuf_lib eigen3_lib)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb5b513441c4c8e339e1862de9092517..58ce5d61c950d12630cfe1de354ffc2a2ba1fd59 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e0bc1d5b8e799ef86cb92a0dda348b0be4e299a
--- /dev/null
+++ b/doc/api/CMakeLists.txt
@@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index ddf0b055a92d80295b24255a5462d477e0d9c796..29388f5005bf779a1bfa63c0d46d35996c0c792d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
     :noindex:
 
+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
 Norm Layer
 ==========
 
@@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
     :noindex:
 
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+    
 Recurrent Layer Group
 =====================
 
@@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+    
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
     :noindex:
 
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Sampling Layers
 ===============
 
@@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
-Factorization Machine Layer
-============================
-
-factorization_machine
----------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
-----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
-
 ..  _api_v2.layer_costs:
 
 Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
     :noindex:
 
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
+    
 Check Layer
 ============
 
@@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
     :noindex:
 
-Miscs
-=====
-
-dropout
---------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========
 
 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
     :noindex:
-
-gated_unit
------------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-
-Detection output Layer
-======================
-
-detection_output
-----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
index 6a8ecc5bb1d855e0ded3719943ab3adb810de365..02e41564b1e48c07da6ac071fc4b60089169e05a 100644
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
     :members:
     :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
index 0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa..a591c7334fd31c98a94b50a4344f251560a0f2f9 100644
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -1,9 +1,14 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-DataFeeder
+data_feeder
 ===========
 
 DataFeeder
------------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+----------
+
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
     :noindex:
+
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
index a23f3301d0331e0ea3733f06444515eb4680cd31..00dcecfd628a35d83d1c596bf0aea819a1705862 100644
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -1,9 +1,21 @@
-===========
-Evaluator
-===========
-
-Evaluator
------------
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
     :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
+
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
index 3a283538c120cfa1ef646c390bb71c6251c23675..a028f6283f2ca333bdf6c9857a98661c0222b41e 100644
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@@ -1,9 +1,32 @@
-===========
-Executor
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========
 
 Executor
+--------
+
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+
+global_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+
+switch_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
     :noindex:
+
diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/v2/fluid/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2147fd3f7ea635d8f14210fbcd1a568ee2230ee
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.py
@@ -0,0 +1,109 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    def __init__(self, module_name, stream=sys.stdout):
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        else:
+            self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        submodule = getattr(self.module, submodule_name)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        item = getattr(self.module, name)
+        if isinstance(item, types.TypeType):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+
+    def print_class(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+
+def main():
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/v2/fluid/gen_doc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ba7b7ba8e51399deb852b0a7c8ddd3128f521e85
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+
+for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+  python gen_doc.py ${module} > ${module}.rst
+done
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
index 8f587837e9873370722062404f511654a9460587..c38be033fff2997930525f51c93995db09daa2b6 100644
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
@@ -1,50 +1,35 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Initializer
+initializer
 ===========
 
+Constant
+--------
 
-
-Initializer
------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: Initializer
-    :noindex:
-
-
-
-ConstantInitializer
--------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: ConstantInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Constant
+    :members:
     :noindex:
 
+Uniform
+-------
 
-
-UniformInitializer
-------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: UniformInitializer
-    :noindex:
-
-
-
-NormalInitializer
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: NormalInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Uniform
+    :members:
     :noindex:
 
+Normal
+------
 
-XavierInitializer
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: XavierInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Normal
+    :members:
     :noindex:
 
+Xavier
+------
 
-MSRAInitializer
----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: MSRAInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Xavier
+    :members:
     :noindex:
 
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
index 67f68c4e9e16b379207b8de114cdf769e056f78e..37c9c273e369532e8ff596e9649cb695a98a2505 100644
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
@@ -1,10 +1,61 @@
-===========
-IO
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+==
+io
+==
 
+save_vars
+---------
 
-is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_vars
+    :noindex:
+
+save_params
 -----------
-..  autofunction:: paddle.v2.fluid.io.is_parameter
+
+..  autofunction:: paddle.v2.fluid.io.save_params
+    :noindex:
+
+save_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.save_persistables
+    :noindex:
+
+load_vars
+---------
+
+..  autofunction:: paddle.v2.fluid.io.load_vars
+    :noindex:
+
+load_params
+-----------
+
+..  autofunction:: paddle.v2.fluid.io.load_params
     :noindex:
+
+load_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.load_persistables
+    :noindex:
+
+save_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.save_inference_model
+    :noindex:
+
+load_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.load_inference_model
+    :noindex:
+
+get_inference_program
+---------------------
+
+..  autofunction:: paddle.v2.fluid.io.get_inference_program
+    :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 25d28de0aad3b1788083c92c4adff8b9a86da9b1..58c493fd7412cf9dbe507c9622d67dae33a5fb25 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -1,536 +1,805 @@
-==========
-Layers
-==========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+======
+layers
+======
 
-fc
----
-..  autofunction:: paddle.v2.fluid.layers.fc
+control_flow
+============
+
+split_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
     :noindex:
 
-embedding
----------
-..  autofunction:: paddle.v2.fluid.layers.embedding
+merge_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
     :noindex:
 
-dynamic_lstm
-------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+BlockGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+    :members:
     :noindex:
 
-dynamic_gru
------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+BlockGuardWithCompletion
+------------------------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+    :members:
     :noindex:
 
-data
-----
-..  autofunction:: paddle.v2.fluid.layers.data
+StaticRNNMemoryLink
+-------------------
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+    :members:
     :noindex:
 
-mean
-----
-..  autofunction:: paddle.v2.fluid.layers.mean
+WhileGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+    :members:
     :noindex:
 
-mul
----
-..  autofunction:: paddle.v2.fluid.layers.mul
+While
+-----
+
+..  autoclass:: paddle.v2.fluid.layers.While
+    :members:
     :noindex:
 
-elementwise_add
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+lod_rank_table
+--------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
     :noindex:
 
-elementwise_sub
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+max_sequence_len
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
     :noindex:
 
-elementwise_mul
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+topk
+----
+
+..  autofunction:: paddle.v2.fluid.layers.topk
     :noindex:
 
-elementwise_div
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+lod_tensor_to_array
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
     :noindex:
 
+array_to_lod_tensor
+-------------------
 
-dropout
--------
-..  autofunction:: paddle.v2.fluid.layers.dropout
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
     :noindex:
 
+increment
+---------
 
-reshape
---------
-..  autofunction:: paddle.v2.fluid.layers.reshape
+..  autofunction:: paddle.v2.fluid.layers.increment
     :noindex:
 
+array_write
+-----------
 
-sigmoid
----------
-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+..  autofunction:: paddle.v2.fluid.layers.array_write
     :noindex:
 
+create_array
+------------
 
-scale
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+less_than
 ---------
-..  autofunction:: paddle.v2.fluid.layers.scale
+
+..  autofunction:: paddle.v2.fluid.layers.less_than
     :noindex:
 
+array_read
+----------
 
-transpose
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+shrink_memory
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+
+array_length
+------------
+
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+IfElse
+------
+
+..  autoclass:: paddle.v2.fluid.layers.IfElse
+    :members:
+    :noindex:
+
+DynamicRNN
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+
+ConditionalBlock
+----------------
+
+..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+
+StaticRNN
 ---------
-..  autofunction:: paddle.v2.fluid.layers.transpose
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+    :members:
     :noindex:
 
+reorder_lod_tensor_by_rank
+--------------------------
 
-sigmoid_cross_entropy_with_logits
----------------------------------
-..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
     :noindex:
 
+ParallelDo
+----------
 
-cast
+..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+
+Print
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.Print
+    :noindex:
+
+device
+======
+
+get_places
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.get_places
+    :noindex:
+
+io
+==
+
+data
 ----
-..  autofunction:: paddle.v2.fluid.layers.cast
+
+..  autofunction:: paddle.v2.fluid.layers.data
     :noindex:
 
+BlockGuardServ
+--------------
 
-concat
--------
-..  autofunction:: paddle.v2.fluid.layers.concat
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+    :members:
     :noindex:
 
+ListenAndServ
+-------------
 
-sums
+..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+
+Send
 ----
-..  autofunction:: paddle.v2.fluid.layers.sums
+
+..  autofunction:: paddle.v2.fluid.layers.Send
     :noindex:
 
+nn
+==
 
-linear_chain_crf
-----------------
-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+fc
+--
+
+..  autofunction:: paddle.v2.fluid.layers.fc
     :noindex:
 
+embedding
+---------
 
-assign
--------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
     :noindex:
 
+dynamic_lstm
+------------
 
-split_lod_tensor
-----------------
-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
     :noindex:
 
+dynamic_lstmp
+-------------
 
-merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+
+dynamic_gru
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+
+gru_unit
+--------
+
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+
+linear_chain_crf
 ----------------
-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+crf_decoding
+------------
+
+..  autofunction:: paddle.v2.fluid.layers.crf_decoding
     :noindex:
 
 cos_sim
---------
+-------
+
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
     :noindex:
 
-
 cross_entropy
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
     :noindex:
 
-
-
 square_error_cost
 -----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
     :noindex:
 
-
 accuracy
----------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.accuracy
     :noindex:
 
+chunk_eval
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+    :noindex:
 
 sequence_conv
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
     :noindex:
 
-
 conv2d
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
     :noindex:
 
-
 sequence_pool
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
     :noindex:
 
+pool2d
+------
 
-sequence_first_step
--------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+..  autofunction:: paddle.v2.fluid.layers.pool2d
     :noindex:
 
+batch_norm
+----------
 
-sequence_last_step
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+
+layer_norm
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.layer_norm
+    :noindex:
+
+beam_search_decode
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
     :noindex:
 
+conv2d_transpose
+----------------
 
-pool2d
-------
-..  autofunction:: paddle.v2.fluid.layers.pool2d
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
 
+sequence_expand
+---------------
 
-batch_norm
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+lstm_unit
+---------
+
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+reduce_sum
 ----------
-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+reduce_mean
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
     :noindex:
 
+reduce_max
+----------
 
-beam_search_decode
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
+
+reduce_min
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+
+sequence_first_step
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+sequence_last_step
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+dropout
+-------
+
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+
+split
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+ctc_greedy_decoder
+------------------
 
-lod_rank_table
---------------
-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
     :noindex:
 
+edit_distance
+-------------
 
-max_sequence_len
-----------------
-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
     :noindex:
 
+l2_normalize
+------------
 
-topk
------
-..  autofunction:: paddle.v2.fluid.layers.topk
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
     :noindex:
 
+matmul
+------
 
-lod_tensor_to_array
--------------------
-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+..  autofunction:: paddle.v2.fluid.layers.matmul
     :noindex:
 
+warpctc
+-------
 
-
-array_to_lod_tensor
--------------------
-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.warpctc
     :noindex:
 
+sequence_reshape
+----------------
 
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+    :noindex:
 
+transpose
+---------
 
-fill_constant
--------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant
+..  autofunction:: paddle.v2.fluid.layers.transpose
     :noindex:
 
+im2sequence
+-----------
 
-
-fill_constant_batch_size_like
------------------------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
     :noindex:
 
+nce
+---
 
-ones
-----
-..  autofunction:: paddle.v2.fluid.layers.ones
+..  autofunction:: paddle.v2.fluid.layers.nce
     :noindex:
 
+beam_search
+-----------
 
-zeros
------
-..  autofunction:: paddle.v2.fluid.layers.zeros
+..  autofunction:: paddle.v2.fluid.layers.beam_search
     :noindex:
 
+row_conv
+--------
 
-increment
----------
-..  autofunction:: paddle.v2.fluid.layers.increment
+..  autofunction:: paddle.v2.fluid.layers.row_conv
     :noindex:
 
+multiplex
+---------
 
-array_write
------------
-..  autofunction:: paddle.v2.fluid.layers.array_write
+..  autofunction:: paddle.v2.fluid.layers.multiplex
     :noindex:
 
+ops
+===
 
+mean
+----
 
-create_array
-------------
-..  autofunction:: paddle.v2.fluid.layers.create_array
+..  autofunction:: paddle.v2.fluid.layers.mean
     :noindex:
 
+mul
+---
 
-less_than
----------
-..  autofunction:: paddle.v2.fluid.layers.less_than
+..  autofunction:: paddle.v2.fluid.layers.mul
     :noindex:
 
+reshape
+-------
 
-array_read
-----------
-..  autofunction:: paddle.v2.fluid.layers.array_read
+..  autofunction:: paddle.v2.fluid.layers.reshape
     :noindex:
 
+scale
+-----
 
-shrink_memory
---------------
-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+..  autofunction:: paddle.v2.fluid.layers.scale
     :noindex:
 
+sigmoid_cross_entropy_with_logits
+---------------------------------
 
-array_length
--------------
-..  autofunction:: paddle.v2.fluid.layers.array_length
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
     :noindex:
 
+elementwise_add
+---------------
 
-conv2d_transpose
-----------------
-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
     :noindex:
 
-
-sequence_expand
+elementwise_div
 ---------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
     :noindex:
 
+elementwise_sub
+---------------
 
-gru_unit
---------
-..  autofunction:: paddle.v2.fluid.layers.gru_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
     :noindex:
 
+elementwise_mul
+---------------
 
-lstm_unit
----------
-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
     :noindex:
 
+elementwise_max
+---------------
 
-sequence_softmax
-----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+..  autofunction:: paddle.v2.fluid.layers.elementwise_max
     :noindex:
 
+elementwise_min
+---------------
 
-reduce_sum
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+..  autofunction:: paddle.v2.fluid.layers.elementwise_min
     :noindex:
 
+elementwise_pow
+---------------
 
-reduce_mean
------------
-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
     :noindex:
 
+clip
+----
 
-reduce_max
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_max
+..  autofunction:: paddle.v2.fluid.layers.clip
     :noindex:
 
+clip_by_norm
+------------
 
-reduce_min
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_min
+..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
     :noindex:
 
+sequence_softmax
+----------------
 
-split
------
-..  autofunction:: paddle.v2.fluid.layers.split
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
     :noindex:
 
+sigmoid
+-------
 
-matmul
-------
-..  autofunction:: paddle.v2.fluid.layers.matmul
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
     :noindex:
 
 logsigmoid
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
     :noindex:
 
 exp
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.exp
     :noindex:
 
 relu
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu
     :noindex:
 
 tanh
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh
     :noindex:
 
 tanh_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
     :noindex:
 
 softshrink
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.softshrink
     :noindex:
 
 sqrt
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.sqrt
     :noindex:
 
 abs
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.abs
     :noindex:
 
 ceil
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.ceil
     :noindex:
 
 floor
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.floor
     :noindex:
 
 round
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.round
     :noindex:
 
 reciprocal
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.reciprocal
     :noindex:
 
 log
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.log
     :noindex:
 
 square
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.square
     :noindex:
 
 softplus
 --------
+
 ..  autofunction:: paddle.v2.fluid.layers.softplus
     :noindex:
 
 softsign
----------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.softsign
     :noindex:
 
 brelu
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.brelu
     :noindex:
 
 leaky_relu
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.leaky_relu
     :noindex:
 
 soft_relu
 ---------
+
 ..  autofunction:: paddle.v2.fluid.layers.soft_relu
     :noindex:
 
 elu
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.elu
     :noindex:
 
 relu6
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu6
     :noindex:
 
 pow
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.pow
     :noindex:
 
+stanh
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.stanh
+    :noindex:
+
 hard_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_shrink
     :noindex:
 
 thresholded_relu
 ----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
     :noindex:
 
 hard_sigmoid
--------------
+------------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
     :noindex:
 
 swish
-------
+-----
+
 ..  autofunction:: paddle.v2.fluid.layers.swish
     :noindex:
 
-im2sequence
+tensor
+======
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_tensor
+    :noindex:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_parameter
+    :noindex:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_global_var
+    :noindex:
+
+cast
+----
+
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+concat
 ------
-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+
+..  autofunction:: paddle.v2.fluid.layers.concat
     :noindex:
 
-edit_distance
----------------
-..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
+sums
+----
+
+..  autofunction:: paddle.v2.fluid.layers.sums
     :noindex:
 
-ctc_greedy_decoder
----------------
-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+assign
+------
+
+..  autofunction:: paddle.v2.fluid.layers.assign
     :noindex:
 
-l2_normalize
-------------
-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
     :noindex:
 
-sequence_reshape
-----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+fill_constant
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
     :noindex:
 
-row_conv
---------
-..  autofunction:: paddle.v2.fluid.layers.row_conv
+ones
+----
+
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+
+zeros
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.zeros
     :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 500019bc507f859c4c91de5d322a82eb1e78e2de..015581b7660848bdb0845fafe2d3fc05405e6ae6 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -1,33 +1,31 @@
-===========
-Nets
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+nets
+====
 
 simple_img_conv_pool
 --------------------
-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-    :noindex:
 
-
-img_conv_group
----------------
-..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
     :noindex:
 
-
 sequence_conv_pool
 ------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
     :noindex:
 
-
 glu
 ---
+
 ..  autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
-
 scaled_dot_product_attention
 ----------------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
     :noindex:
 
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 19b4940f08de3e2f7dc177f2961e538946d10a78..1691ebb9a7cb16da96e04147d0adea322374f529 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -1,54 +1,49 @@
-===========
-Optimizer
-===========
-
-Optimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: Optimizer
-    :noindex:
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+=========
+optimizer
+=========
 
-SGDOptimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: SGDOptimizer
-    :noindex:
+SGD
+---
 
+..  autoclass:: paddle.v2.fluid.optimizer.SGD
+    :members:
+    :noindex:
 
+Momentum
+--------
 
-MomentumOptimizer
------------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: MomentumOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Momentum
+    :members:
     :noindex:
 
+Adagrad
+-------
 
-
-AdagradOptimizer
-----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
+    :members:
     :noindex:
 
+Adam
+----
 
-AdamOptimizer
--------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adam
+    :members:
     :noindex:
 
+Adamax
+------
 
-AdamaxOptimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamaxOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adamax
+    :members:
     :noindex:
 
+DecayedAdagrad
+--------------
 
-DecayedAdagradOptimizer
------------------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: DecayedAdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+    :members:
     :noindex:
 
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
index ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50..8083d0d858dafcd275eaddb9b475875ee42ef724 100644
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -1,11 +1,21 @@
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+param_attr
+==========
+
 ParamAttr
-===========
+---------
 
+..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
 
+WeightNormParamAttr
+-------------------
 
-ParamAttr
------------
-..  automodule:: paddle.v2.fluid.param_attr
-    :members: ParamAttr
+..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+    :members:
     :noindex:
+
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
index 7d4042d1f41c12c4a551ba6576559d612116872a..4a1ff7cb6976e0054f77428b699ea679aa91394f 100644
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
@@ -1,10 +1,25 @@
-===========
-Profiler
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+========
+profiler
+========
 
+cuda_profiler
+-------------
 
-Profiler
------------
 ..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
     :noindex:
+
+reset_profiler
+--------------
+
+..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+    :noindex:
+
+profiler
+--------
+
+..  autofunction:: paddle.v2.fluid.profiler.profiler
+    :noindex:
+
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 868e225ed3d59e79aeb217fb88081ea25f80fa2c..2c17d15599baa1d02eb87c7b6c40034769ebb3a4 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -1,25 +1,27 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Regularizer
+regularizer
 ===========
 
-WeightDecayRegularizer
-----------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: WeightDecayRegularizer
-    :noindex:
-
+append_regularization_ops
+-------------------------
 
-L2DecayRegularizer
-------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L2DecayRegularizer
+..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
     :noindex:
 
+L1Decay
+-------
 
+..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
 
-L1DecayRegularizer
--------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L1DecayRegularizer
+L2Decay
+-------
 
+..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
 
diff --git a/doc/howto/dev/build_cn.md b/doc/build_and_install/build_cn.md
similarity index 100%
rename from doc/howto/dev/build_cn.md
rename to doc/build_and_install/build_cn.md
diff --git a/doc/howto/dev/build_en.md b/doc/build_and_install/build_en.md
similarity index 100%
rename from doc/howto/dev/build_en.md
rename to doc/build_and_install/build_en.md
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
similarity index 99%
rename from doc/getstarted/build_and_install/build_from_source_cn.rst
rename to doc/build_and_install/build_from_source_cn.rst
index 71904dc41ed0d946867d890cc585e1b88450ca8c..ff904b1022a41612c9680dce92d3fc2c69ad7e93 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种B
     "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
     "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
     "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
     "WITH_DOC", "是否编译中英文文档", "OFF"
     "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
     "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
similarity index 99%
rename from doc/getstarted/build_and_install/build_from_source_en.rst
rename to doc/build_and_install/build_from_source_en.rst
index 27f73b2e2c029b41d514e1612912ed1c335605b6..718fb869c23a1f7be82c87c726282bded9dad516 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
     "WITH_AVX", "Build with AVX support", "ON"
     "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
     "WITH_STYLE_CHECK", "Check code style when building", "ON"
-    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
     "WITH_DOC", "Build documentations", "OFF"
     "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
     "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/build_and_install/docker_install_cn.rst
similarity index 97%
rename from doc/getstarted/build_and_install/docker_install_cn.rst
rename to doc/build_and_install/docker_install_cn.rst
index 98fada7bdb46f4dd2927d6f93bcbcebbe7d18604..79d214635a069a739060e0b79424729f6ff90387 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
      docker run -p 8888:8888 paddlepaddle/book
 
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
 然后在浏览器中输入以下网址：
 
   .. code-block:: text
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/build_and_install/docker_install_en.rst
similarity index 97%
rename from doc/getstarted/build_and_install/docker_install_en.rst
rename to doc/build_and_install/docker_install_en.rst
index b1d0890b4cdddb77114a80276130afd07c22d270..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
 
      docker run -p 8888:8888 paddlepaddle/book
 
+For users in China, we provide a faster mirror:
+
+  .. code-block: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
 Then, you would back and paste the address into the local browser:
 
   .. code-block:: text
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst
similarity index 94%
rename from doc/getstarted/build_and_install/index_cn.rst
rename to doc/build_and_install/index_cn.rst
index c9ba84c842b530162c92713046e64fdf82bd441b..4220ff2279333f25eb644227100308428bf72362 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式：
 
    pip_install_cn.rst
    docker_install_cn.rst
-   ../../howto/dev/build_cn.md
+   build_cn.md
 
 编译流程
 ++++++++
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst
similarity index 95%
rename from doc/getstarted/build_and_install/index_en.rst
rename to doc/build_and_install/index_en.rst
index 32d66d63dd5b2a30d5de4a088dc80b680830cb84..db6b5be742be1619c52f5f7000bec013e818693d 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
 
    pip_install_en.rst
    docker_install_en.rst
-   ../../howto/dev/build_en.md
+   build_en.md
 
 
 Build from Source
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/build_and_install/paddleci.png
similarity index 100%
rename from doc/getstarted/build_and_install/paddleci.png
rename to doc/build_and_install/paddleci.png
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/build_and_install/pip_install_cn.rst
similarity index 93%
rename from doc/getstarted/build_and_install/pip_install_cn.rst
rename to doc/build_and_install/pip_install_cn.rst
index 0c741e936b46eda5e7165e4ee54b545b14a28a19..8e4165da6b8135d083766c650f1092158f9d01c2 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/build_and_install/pip_install_cn.rst
@@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
 
     "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
     "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
similarity index 93%
rename from doc/getstarted/build_and_install/pip_install_en.rst
rename to doc/build_and_install/pip_install_en.rst
index 285ed09805b09790beaef014f6813c227aff33ac..c1e806c0fe5f03139c0dff985f9ae0856eaa2e98 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
@@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
 
     "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
     "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
index f9991541bc51c6e13ffce4e9cec60f73dc800121..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf 100644
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
@@ -1,23 +1,23 @@
-## Auto Gradient Checker Design
+## Auto Gradient Check Design
 
-## Backgraound：
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  1. you should get the right backpropagation formula according to the forward computation.
-  2. you should implement it right in CPP.
-  3. it's difficult to prepare test data.
+## Background：
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The formula for backpropagation formula should be correct according to the forward computation.
+  2. The Implementation of the above shoule be correct in CPP.
+  3. It is difficult to prepare an unbiased test data.
 
-- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
-  1. numerical gradient checker only need forward operator.
-  2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. Numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator.
 
 ## Mathematical Theory
-The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
 
 
-## Numeric Gradient Implementation
+## Numerical Gradient Implementation
 ### Python Interface
 ```python
 def get_numerical_gradient(op,
@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
                          delta=0.005,
                          local_scope=None):
     """
-    Get Numeric Gradient for an operator's input.
+    Get Numerical Gradient for the input of an operator.
 
-    :param op: C++ operator instance, could be an network
+    :param op: C++ operator instance, could be an network.
     :param input_values: The input variables. Should be an dictionary, whose key is
-    variable name, and value is numpy array.
+    variable name, and value is a numpy array.
     :param output_name: The final output variable name.
-    :param input_to_check: The input variable with respect to which to compute the gradient.
-    :param delta: The perturbation value for numeric gradient method. The
-    smaller delta is, the more accurate result will get. But if that delta is
-     too small, it will suffer from numerical stability problem.
+    :param input_to_check: The input variable with respect to which the gradient has to be computed.
+    :param delta: The perturbation value for numerical gradient method. The
+    smaller the delta, the more accurate the result. But if the delta is too
+    small, it will suffer from the numerical stability problem.
     :param local_scope: The local scope used for get_numeric_gradient.
     :return: The gradient array in numpy format.
     """
 ```
 
-### Explaination:
+### Explanation:
 
-- Why need `output_name`
-  - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
+- Why do we need an `output_name`
+  - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
 
-- Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+- Why do we need `input_to_check`
+  - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
 
 
 ### Core Algorithm Implementation
 
 
 ```python
-    # we only compute gradient of one element a time.
+    # we only compute the gradient of one element a time.
     # we use a for loop to compute the gradient of each element.
     for i in xrange(tensor_size):
-        # get one input element by its index i.
-        origin = tensor_to_check.get_float_element(i)
+        # get one input element using the index i.
+        original = tensor_to_check.get_float_element(i)
 
-        # add delta to it, run op and then get the new value of the result tensor.
-        x_pos = origin + delta
+        # add delta to it, run the forward op and then
+        # get the new value of the result tensor.
+        x_pos = original + delta
         tensor_to_check.set_float_element(i, x_pos)
         y_pos = get_output()
 
-        # plus delta to this element, run op and get the new value of the result tensor.
-        x_neg = origin - delta
+        # Subtract delta from this element, run the op again
+        # and get the new value of the result tensor.
+        x_neg = original - delta
         tensor_to_check.set_float_element(i, x_neg)
         y_neg = get_output()
 
         # restore old value
-        tensor_to_check.set_float_element(i, origin)
+        tensor_to_check.set_float_element(i, original)
 
-        # compute the gradient of this element and store it into a numpy array.
+        # compute the gradient of this element and store
+        # it into a numpy array.
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
     # reshape the gradient result to the shape of the source tensor.
     return gradient_flat.reshape(tensor_to_check.get_dims())
 ```
 
-## Auto Graident Checker Framework
+## Auto Gradient Check Framework
 
 Each Operator Kernel has three kinds of Gradient:
 
 1. Numerical gradient
 2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
+3. GPU kernel gradient (if supported by the device)
 
-The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
 
-1. calculate the numerical gradient
-2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
 
 #### Python Interface
 
@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
         """
         :param forward_op: used to create backward_op
         :param input_vars: numpy value of input variable. The following
-            computation will use these variables.
-        :param inputs_to_check: the input variable with respect to which to compute the gradient.
+          computation will use these variables.
+        :param inputs_to_check: the input variable with respect to which the
+          gradient will be computed.
         :param output_name: The final output variable name.
         :param max_relative_error: The relative tolerance parameter.
-        :param no_grad_set: used when create backward ops
+        :param no_grad_set: used to create backward ops
         :param only_cpu: only compute and check gradient on cpu kernel.
         :return:
         """
 ```
 
-### How to check if two numpy array is close enough?
-if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
+### How to check if two numpy arrays are close enough?
+if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad.
 
 ```python
 numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
 
 abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
 abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
 
 diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)
 
 
 #### Notes：
-The Input data for auto gradient checker should be reasonable to avoid numerical  stability problem.
+The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
 
 
-#### Refs:
+#### References:
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..40205350f99722f0b71bfa6f390fe9d01d831966
--- /dev/null
+++ b/doc/design/cpp_data_feeding.md
@@ -0,0 +1,79 @@
+# C++ Data Feeding
+
+In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
+
+In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
+
+## Reader
+
+A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
+
+
+### `ReaderBase`
+
+`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
+
+```cpp
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  // Read the next batch of data. (A 'batch' can be only one instance)
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Show whether the next bacth exists.
+  virtual bool HasNext() const = 0;
+  
+  // Reinitialize the reader and read the file from the begin.
+  virtual void ReInit() = 0;
+  
+  // Get a certain read in data's shape.
+  DDim shape(size_t idx) const;
+  // Get shapes of all read in data.
+  std::vector<DDim> shapes() const { return shapes_; }
+  // Set shapes of read in data.
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+```
+
+### `FileReader` and `DecoratedReader`
+
+These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
+
+
+### `ReaderHolder`
+
+Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+we have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some now ops are introduced:
+
+### `CreateReaderOp`
+
+Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
+
+### `ReadOp`
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
diff --git a/doc/design/csp.md b/doc/design/csp.md
index ba9cacfdea7dcf7c6499b562dfc58400d082f2c8..10d936860fab7e09241e968a63526c7d86d3e568 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i
 
 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
 
-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
 
 ### Type Channel
 
@@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:
 
 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```
 
 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
 
 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```
 
 or Tensors of Tensors of float16 etc.
@@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we
 
 ### Send and Recv
 
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok <- ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
 ### Select
 
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs
 
 ### 1. RPC between Trainers and Parameter Servers
diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md
index 3a741f95866fb6c301ca9097af7916281f2278cf..9368c5780dc922953f38bf0f86d9f797a4a8a6fe 100644
--- a/doc/design/dist_refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -152,12 +152,12 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.
 
-<img src="src/remote_executor.png"/>
+<img src="src/remote_executor.png" width="500" align="center" />
 
 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
 to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
+to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.
 
 
 ### Placement Algorithm
diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle
index ce2c18fee5687732053c48af9c8c290a994a8090..41b2067311694b56d211a4f32d1b76884eeffd2d 100644
Binary files a/doc/design/dist_refactor/src/remote_executor.graffle and b/doc/design/dist_refactor/src/remote_executor.graffle differ
diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png
index 6be4b1841b99efdb59557975485d0387f422308c..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 100644
Binary files a/doc/design/dist_refactor/src/remote_executor.png and b/doc/design/dist_refactor/src/remote_executor.png differ
diff --git a/doc/design/speech/README.MD b/doc/design/speech/deep_speech_2.md
similarity index 85%
rename from doc/design/speech/README.MD
rename to doc/design/speech/deep_speech_2.md
index 7304650e628dba210488cd2dc4836318b5383b2a..cfdc4d6df04344c70d3334626bd38eca997c31ff 100644
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/deep_speech_2.md
@@ -140,7 +140,19 @@ TODO by Assignees
 
 ### Beam Search with CTC and LM
 
-TODO by Assignees
+<div align="center">
+<img src="image/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
+   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
+ 
 
 ## Future Work
 
@@ -153,3 +165,4 @@ TODO by Assignees
 
 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/speech/image/beam_search.png b/doc/design/speech/image/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/design/speech/image/beam_search.png differ
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index 4c5f10e2ecb9ec09b78926ca27552741d02d7cc9..8983df900460127fc130043c52373dab505363ba 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -2,9 +2,9 @@
 
 ## Background
 
-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
 
-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
 
 On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
 
@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
 
 There are mainly three parts that we have to consider while integrating a new device/library:
 
-- Place and DeviceContext: indicates the device id and manages hardware resources
+- Place and DeviceContext: indicate the device id and manage hardware resources
 
 - Memory and Tensor: malloc/free data on certain device
 
@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
 
 ### Place and DeviceContext
 
-Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
 
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
 
 ```
         |   CPUPlace
@@ -144,7 +144,7 @@ class Tensor {
 };
 ```
 
-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory.
 
 ```cpp
 paddle::framework::Tensor t;
@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
 
 Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
 
-The interface is defined in header file.
+The interface is defined in the header file.
 
 ```
 template <typename DeviceContext, typename T>
@@ -174,7 +174,7 @@ class MaxOutFunctor {
 };
 ```
 
-CPU implemention is in .cc file
+CPU implementation is in .cc file
 
 ```
 template <typename T>
@@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> {
 };
 ```
 
-CUDA implemention is in .cu file
+CUDA implementation is in .cu file
 
 ```
 template <typename T>
@@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
 ```
 
 
-We get computing handle from a concrete DeviceContext, and make compution on tensors.
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
 
-The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
 
 Fluid provides different register interfaces in op_registry.h
 
@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
 
 ## Advanced topics: How to switch between different Device/Library
 
-Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
 
 
 For more details, please refer to following docs:
diff --git a/doc/design/switch.md b/doc/design/switch.md
new file mode 100644
index 0000000000000000000000000000000000000000..827d0601c621e4a230de28e2baad8e196e69625e
--- /dev/null
+++ b/doc/design/switch.md
@@ -0,0 +1,31 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with switch() as switch:
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
diff --git a/doc/howto/dev/FullyConnected.jpg b/doc/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/dev/FullyConnected.jpg
rename to doc/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
similarity index 100%
rename from doc/howto/dev/contribute_to_paddle_cn.md
rename to doc/dev/contribute_to_paddle_cn.md
diff --git a/doc/dev/contribute_to_paddle_en.md b/doc/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000000000000000000000000000000000..f939e75f21a8badb5c40f527abd0e098fe9bc472
--- /dev/null
+++ b/doc/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/dev/index_cn.rst b/doc/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..487db868bb2a0a5383d56c3a723912d9fd5910b7
--- /dev/null
+++ b/doc/dev/index_cn.rst
@@ -0,0 +1,8 @@
+开发标准
+========
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.rst
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5dd12d2233cff20e021b90beb94571a2817bd1ad
--- /dev/null
+++ b/doc/dev/index_en.rst
@@ -0,0 +1,9 @@
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
+  contribute_to_paddle_en.md
+  write_docs_en.rst
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/dev/new_layer_cn.rst
similarity index 100%
rename from doc/howto/dev/new_layer_cn.rst
rename to doc/dev/new_layer_cn.rst
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/dev/new_layer_en.rst
similarity index 100%
rename from doc/howto/dev/new_layer_en.rst
rename to doc/dev/new_layer_en.rst
diff --git a/doc/howto/dev/new_op_cn.md b/doc/dev/new_op_cn.md
similarity index 100%
rename from doc/howto/dev/new_op_cn.md
rename to doc/dev/new_op_cn.md
diff --git a/doc/howto/dev/new_op_en.md b/doc/dev/new_op_en.md
similarity index 100%
rename from doc/howto/dev/new_op_en.md
rename to doc/dev/new_op_en.md
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/dev/new_op_kernel_en.md
similarity index 100%
rename from doc/howto/dev/new_op_kernel_en.md
rename to doc/dev/new_op_kernel_en.md
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/dev/use_eigen_cn.md
similarity index 100%
rename from doc/howto/dev/use_eigen_cn.md
rename to doc/dev/use_eigen_cn.md
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/dev/use_eigen_en.md
similarity index 100%
rename from doc/howto/dev/use_eigen_en.md
rename to doc/dev/use_eigen_en.md
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/dev/write_docs_cn.rst
similarity index 98%
rename from doc/howto/dev/write_docs_cn.rst
rename to doc/dev/write_docs_cn.rst
index 1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c..f79769b810b91c6984016d95f40b89186bfb61b0 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-##################
-如何贡献/修改文档
-##################
+#############
+如何贡献文档
+#############
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/dev/write_docs_en.rst
similarity index 98%
rename from doc/howto/dev/write_docs_en.rst
rename to doc/dev/write_docs_en.rst
index b3ef07eb1d0012827df8e6a4f27c5fa643649492..f3408a84269aaeef19986c220454555fbbe30e23 100644
--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
@@ -1,6 +1,6 @@
-##################
+########################
 Contribute Documentation
-##################
+########################
 
 PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
 Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index e695ff283e2e806377a51c559b37e8068360a4ff..608f49f5a969b3291eb43bf2acf582af74e566a1 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -4,7 +4,7 @@
 
 PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
 这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前，请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+在使用该文档之前，请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
 
 
 配置网络
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 9f6ee25987d51dcca3a37cf0f62a70a5a5a2d89a..1dc141396b95bda776aeff87ac30fad6baf37bd2 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,61 +1,8 @@
 新手入门
 ============
 
-.. _quick_install:
-
-快速安装
-++++++++
-
-PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
-执行下面的命令完成快速安装，版本为cpu_avx_openblas：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-更详细的安装和编译方法参考：
-
-..  toctree::
-  :maxdepth: 1
-
-  build_and_install/index_cn.rst
-
-.. _quick_start:
-
-快速开始
-++++++++
-
-创建一个 housing.py 并粘贴此Python代码：
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
-
 ..  toctree::
   :maxdepth: 1
 
+  quickstart_cn.rst
   concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 063d9d880c82550f7f5d47d3d0b1fff59865bca7..c680e1903750117073bee64cb4d4f4ccfff5ac3d 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,61 +1,7 @@
 GET STARTED
 ============
 
-.. _quick_install:
-
-Quick Install
-----------------------
-
-You can use pip to install PaddlePaddle with a single command, supports
-CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
-Simply run the following command to install, the version is cpu_avx_openblas:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-For more details about installation and build:
-
 ..  toctree::
   :maxdepth: 1
 
-  build_and_install/index_en.rst
-
-
-.. _quick_start:
-
-Quick Start
-++++++++
-
-Create a new file called housing.py, and paste this Python
-code:
-
-
-  .. code-block:: python
-
-     import paddle.v2 as paddle
-
-     # Initialize PaddlePaddle.
-     paddle.init(use_gpu=False, trainer_count=1)
-
-     # Configure the neural network.
-     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-     # Infer using provided test data.
-     probs = paddle.infer(
-         output_layer=y_predict,
-         parameters=paddle.dataset.uci_housing.model(),
-         input=[item for item in paddle.dataset.uci_housing.test()()])
-
-     for i in xrange(len(probs)):
-         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
-
-Run :code:`python housing.py` and voila! It should print out a list of predictions
-for the test housing data.
+  quickstart_en.rst
diff --git a/doc/getstarted/quickstart_cn.rst b/doc/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d511cead262dabafd095f68adb5ffc596a7fe596
--- /dev/null
+++ b/doc/getstarted/quickstart_cn.rst
@@ -0,0 +1,47 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：:ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/getstarted/quickstart_en.rst b/doc/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..70f7fe0646068aa79cd72955c6848ac0250c2300
--- /dev/null
+++ b/doc/getstarted/quickstart_en.rst
@@ -0,0 +1,51 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install, the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/capi/compile_paddle_lib_cn.md
similarity index 99%
rename from doc/howto/usage/capi/compile_paddle_lib_cn.md
rename to doc/howto/capi/compile_paddle_lib_cn.md
index ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14..fd8dec8164580b9dcb716e69f3cc5357639f17d3 100644
--- a/doc/howto/usage/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
@@ -1,4 +1,4 @@
-## 编译 PaddlePaddle 预测库
+## 安装与编译C-API预测库
 
 ### 概述
 
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/capi/images/csr.png
similarity index 100%
rename from doc/howto/usage/capi/images/csr.png
rename to doc/howto/capi/images/csr.png
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/capi/images/sequence_data.png
similarity index 100%
rename from doc/howto/usage/capi/images/sequence_data.png
rename to doc/howto/capi/images/sequence_data.png
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/capi/images/workflow_of_CAPI.png
similarity index 100%
rename from doc/howto/usage/capi/images/workflow_of_CAPI.png
rename to doc/howto/capi/images/workflow_of_CAPI.png
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/capi/index_cn.rst
similarity index 87%
rename from doc/howto/usage/capi/index_cn.rst
rename to doc/howto/capi/index_cn.rst
index fd774fbc742671c5a8009cb742f2c9d06a525199..e589a6d346a1e23a4eed9801e02727c80782ae8b 100644
--- a/doc/howto/usage/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
@@ -1,4 +1,4 @@
-PaddlePaddle C-API
+C-API预测库
 ==================
 
 ..  toctree::
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
similarity index 100%
rename from doc/howto/usage/capi/organization_of_the_inputs_cn.md
rename to doc/howto/capi/organization_of_the_inputs_cn.md
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
similarity index 99%
rename from doc/howto/usage/capi/workflow_of_capi_cn.md
rename to doc/howto/capi/workflow_of_capi_cn.md
index e0a42fff12cf0f53dee18165e059150861524f74..a61d2267bfdb7c32da528735b20d7c6a531aaa1f 100644
--- a/doc/howto/usage/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -1,4 +1,4 @@
-## C-API 使用流程
+## C-API使用流程
 
 这篇文档介绍 PaddlePaddle C-API 整体使用流程。
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/cluster/cmd_argument_cn.md
similarity index 55%
rename from doc/howto/usage/cluster/cluster_train_cn.md
rename to doc/howto/cluster/cmd_argument_cn.md
index c2fc86687d7106aac7c74d6dd16bc229353cb7c1..5c575dd5b53f6e4ea025a8fbaebdb2d1a1f1c9ed 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,41 +1,7 @@
-# 分布式训练
-
-
-## 概述
-
-本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
-
-<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
-
-- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
-- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
-- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
-
-这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
-
-在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
-
-
-## 环境准备
-
-1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
-
-安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
-```bash
-$ paddle version
-PaddlePaddle 0.10.0, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
+## 启动参数说明
 
-下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
+下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-## 启动参数说明
 ### 启动参数服务器
 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
 ```bash
@@ -92,11 +58,11 @@ paddle.init(
 参数说明
 
 - use_gpu： **可选，默认False**，是否启用GPU训练
-- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- trainer_count：**必选，默认1**，当前trainer的线程数目
 - port：**必选，默认7164**，连接到pserver的端口
 - ports_num：**必选，默认1**，连接到pserver的端口个数
 - ports_num_for_sparse：**必选，默认0**，和pserver之间用于稀疏类型参数通信的端口个数
-- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- num_gradient_servers：**必选，默认1**，当前训练任务trainer总数
 - trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
 
@@ -167,22 +133,3 @@ test.txt-00002
 
 - `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
 - `test_data_dir`：包含测试数据集的目录。
-
-## 使用分布式计算平台或工具
-
-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
-- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
-- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
-- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
-
-对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
-
-在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
-
-## 在不同集群中运行
-
-  - [fabric集群](fabric_cn.md)
-  - [openmpi集群](openmpi_cn.md)
-  - [kubernetes单机](k8s_cn.md)
-  - [kubernetes distributed分布式](k8s_distributed_cn.md)
-  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/cluster/cmd_argument_en.md
similarity index 56%
rename from doc/howto/usage/cluster/cluster_train_en.md
rename to doc/howto/cluster/cmd_argument_en.md
index 28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97..06fd5717564c99e3bb46835a2bd5071dff665f23 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,40 +1,7 @@
-# Distributed Training
-
-## Introduction
-
-In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
-
-<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
-
-- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
-- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
-- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
-
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
-
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
-
-## Preparations
-1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
-
-After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
-
-```bash
-$ paddle version
-PaddlePaddle 0.10.0rc, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_timer: OFF
-```
-
-We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-
 ## Command-line arguments
 
+We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
+
 ### Starting parameter server
 
 Type the below command to start a parameter server which will wait for trainers to connect:
@@ -95,11 +62,11 @@ paddle.init(
 Parameter Description
 
 - use_gpu: **optional, default False**, set to "True" to enable GPU training.
-- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
 - port: **required, default 7164**, port to connect to parameter server.
 - ports_num: **required, default 1**, number of ports for communication.
 - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
-- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
 
@@ -171,21 +138,3 @@ Your workspace may looks like:
 
 - `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
 - `test_data_dir`: containing testing data.
-
-## Use cluster platforms or cluster management tools
-
-PaddlePaddle supports running jobs on several platforms including:
-- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
-- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
-
-We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
-
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
-
-## Use different clusters
-
-  - [fabric](fabric_en.md)
-  - [openmpi](openmpi_en.md)
-  - [kubernetes](k8s_en.md)
-  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md
similarity index 91%
rename from doc/howto/usage/cluster/fluid_cluster_train_en.md
rename to doc/howto/cluster/fluid_cluster_train_en.md
index 11904a6f71bb6ce37417aeffb8e408ec65961b12..ae825d9a517c7e9005d4e32f8f34b3f6a79be0c9 100644
--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/cluster/fluid_cluster_train_en.md
@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes
 
 PaddlePaddle build and installation guide can be found  [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
 
+In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
 ### Update the training script
 
 #### Non-cluster training script
@@ -119,7 +125,14 @@ for pass_id in range(100):
 
 ### E2E demo
 
-Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line:
+Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/v2/fluid/tests/book_distribute
+```
+
+In parameter server node run the following in the command line:
 
 ``` bash
 PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
diff --git a/doc/howto/cluster/index_cn.rst b/doc/howto/cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a60521b4a9646bdc6d9f1bf6da482acc989d8bf3
--- /dev/null
+++ b/doc/howto/cluster/index_cn.rst
@@ -0,0 +1,22 @@
+分布式训练
+==========
+
+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_cn.md
+  cmd_argument_cn.md
+  multi_cluster/index_cn.rst
diff --git a/doc/howto/cluster/index_en.rst b/doc/howto/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2640a09dcc904619bc97c9bd3f3d81a9dc307663
--- /dev/null
+++ b/doc/howto/cluster/index_en.rst
@@ -0,0 +1,22 @@
+Distributed Training
+====================
+
+In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
+
+.. image:: src/ps_en.png
+   :width: 500
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_en.md
+  cmd_argument_en.md
+  multi_cluster/index_en.rst
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/cluster/multi_cluster/fabric_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_cn.md
rename to doc/howto/cluster/multi_cluster/fabric_cn.md
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
similarity index 100%
rename from doc/howto/usage/cluster/fabric_en.md
rename to doc/howto/cluster/multi_cluster/fabric_en.md
diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ef56b6ddb38e59f20f7248de1ceb952c7627ce76
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
@@ -0,0 +1,20 @@
+在不同集群中运行
+================
+
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
+- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
+- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
+- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
+
+对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
+
+在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+  openmpi_cn.md
+  k8s_cn.md
+  k8s_distributed_cn.md
+  k8s_aws_cn.md
diff --git a/doc/howto/cluster/multi_cluster/index_en.rst b/doc/howto/cluster/multi_cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dac7aaef085c80851c1bbb89250faf2151de4ca6
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
@@ -0,0 +1,19 @@
+Use different clusters
+======================
+
+PaddlePaddle supports running jobs on several platforms including:
+- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
+- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
+- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
+
+We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+
+These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_en.md
+  openmpi_en.md
+  k8s_en.md
+  k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/cluster/multi_cluster/k8s_aws_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_cn.md
diff --git a/doc/howto/usage/cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_aws_en.md
rename to doc/howto/cluster/multi_cluster/k8s_aws_en.md
diff --git a/doc/howto/usage/cluster/k8s_cn.md b/doc/howto/cluster/multi_cluster/k8s_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_cn.md
diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_distributed_cn.md
rename to doc/howto/cluster/multi_cluster/k8s_distributed_cn.md
diff --git a/doc/howto/usage/cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s_en.md
rename to doc/howto/cluster/multi_cluster/k8s_en.md
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_cn.md
rename to doc/howto/cluster/multi_cluster/openmpi_cn.md
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
similarity index 100%
rename from doc/howto/usage/cluster/openmpi_en.md
rename to doc/howto/cluster/multi_cluster/openmpi_en.md
diff --git a/doc/howto/usage/cluster/src/add_security_group.png b/doc/howto/cluster/multi_cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/add_security_group.png
rename to doc/howto/cluster/multi_cluster/src/add_security_group.png
diff --git a/doc/howto/usage/cluster/src/create_efs.png b/doc/howto/cluster/multi_cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/cluster/src/create_efs.png
rename to doc/howto/cluster/multi_cluster/src/create_efs.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s-paddle-arch.png
rename to doc/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/cluster/src/k8s_data/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_data/README.md b/doc/howto/cluster/multi_cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_data/get_data.sh b/doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_data/get_data.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/Dockerfile b/doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/Dockerfile
rename to doc/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
diff --git a/doc/howto/usage/cluster/src/k8s_train/README.md b/doc/howto/cluster/multi_cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/README.md
rename to doc/howto/cluster/multi_cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/cluster/src/k8s_train/start.sh b/doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start.sh
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py b/doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/src/k8s_train/start_paddle.py
rename to doc/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/cluster/src/pserver_and_trainer.png b/doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/cluster/src/pserver_and_trainer.png
rename to doc/howto/cluster/multi_cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/cluster/src/route53_create_recordset.png b/doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_recordset.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/cluster/src/route53_create_zone.png b/doc/howto/cluster/multi_cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/cluster/src/route53_create_zone.png
rename to doc/howto/cluster/multi_cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/cluster/src/worker_security_group.png b/doc/howto/cluster/multi_cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/src/worker_security_group.png
rename to doc/howto/cluster/multi_cluster/src/worker_security_group.png
diff --git a/doc/howto/cluster/preparations_cn.md b/doc/howto/cluster/preparations_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce40697e703503b66f6306e15ebdb0ce1329991d
--- /dev/null
+++ b/doc/howto/cluster/preparations_cn.md
@@ -0,0 +1,16 @@
+## 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/cluster/preparations_en.md b/doc/howto/cluster/preparations_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b77b293907ae0548134fc65ceed3aa0ed0b845d
--- /dev/null
+++ b/doc/howto/cluster/preparations_en.md
@@ -0,0 +1,17 @@
+## Preparations
+
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
+
+After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/howto/usage/cluster/src/Dockerfile b/doc/howto/cluster/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/src/Dockerfile
rename to doc/howto/cluster/src/Dockerfile
diff --git a/doc/howto/usage/cluster/src/efs_mount.png b/doc/howto/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/cluster/src/efs_mount.png
rename to doc/howto/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/managed_policy.png b/doc/howto/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/cluster/src/managed_policy.png
rename to doc/howto/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/cluster/src/ps_cn.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer_cn.png
rename to doc/howto/cluster/src/ps_cn.png
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/cluster/src/ps_en.png
similarity index 100%
rename from doc/howto/usage/cluster/src/trainer.png
rename to doc/howto/cluster/src/ps_en.png
diff --git a/doc/howto/cluster/src/trainer.png b/doc/howto/cluster/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/howto/cluster/src/trainer.png differ
diff --git a/doc/howto/cluster/src/trainer_cn.png b/doc/howto/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/howto/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/cluster/src/word2vec/api_train_v2.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2.py
rename to doc/howto/cluster/src/word2vec/api_train_v2.py
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
rename to doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/cluster/src/word2vec/prepare.py
similarity index 100%
rename from doc/howto/usage/cluster/src/word2vec/prepare.py
rename to doc/howto/cluster/src/word2vec/prepare.py
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/cmd_parameter/arguments_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_cn.md
rename to doc/howto/cmd_parameter/arguments_cn.md
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/arguments_en.md
rename to doc/howto/cmd_parameter/arguments_en.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/cmd_parameter/detail_introduction_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_cn.md
rename to doc/howto/cmd_parameter/detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/cmd_parameter/detail_introduction_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/detail_introduction_en.md
rename to doc/howto/cmd_parameter/detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
similarity index 85%
rename from doc/howto/usage/cmd_parameter/index_cn.rst
rename to doc/howto/cmd_parameter/index_cn.rst
index 4c8729821110b9aec99351fc0a83a1ba75a8a2bb..17b379f6295d66d864e2b53108012eff5895d96b 100644
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -1,6 +1,6 @@
 ..  _cmd_line_index:
 
-设置命令行参数
+命令行参数设置
 ===============
 
 ..  toctree::
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/cmd_parameter/index_en.rst
similarity index 100%
rename from doc/howto/usage/cmd_parameter/index_en.rst
rename to doc/howto/cmd_parameter/index_en.rst
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/cmd_parameter/use_case_cn.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_cn.md
rename to doc/howto/cmd_parameter/use_case_cn.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/cmd_parameter/use_case_en.md
similarity index 100%
rename from doc/howto/usage/cmd_parameter/use_case_en.md
rename to doc/howto/cmd_parameter/use_case_en.md
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
deleted file mode 120000
index c97564d93a7f0a753a23cd97d2467d595bd154ff..0000000000000000000000000000000000000000
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index e0c69f7a6a4043abe999af6c8dd2555178b68424..0c534f107b6e047035c424ed2ea59f3982799b63 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -1,37 +1,11 @@
-进阶指南
+进阶使用
 ========
 
-使用说明
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_cn.rst
-  usage/cluster/cluster_train_cn.md
-  usage/capi/index_cn.rst
-
-开发标准
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/contribute_to_paddle_cn.md
-  dev/write_docs_cn.rst
-
-模型配置
---------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_cn.rst
-
-性能优化
---------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_cn.rst
+  cluster/index_cn.rst
+  capi/index_cn.rst
+  rnn/index_cn.rst
   optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 6d1bf7dfc003da6de31410ee0a7959233adfaf76..ae8b86f75b5de770312fb2fdc46db490a18e5ff6 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -1,37 +1,10 @@
 HOW TO
 =======
 
-Usage
--------
-
-..  toctree::
-  :maxdepth: 1
-
-  usage/cmd_parameter/index_en.rst
-  usage/cluster/cluster_train_en.md
-
-Development
-------------
-
-..  toctree::
-  :maxdepth: 1
-
-  dev/new_layer_en.rst
-  dev/contribute_to_paddle_en.md
-  dev/write_docs_en.rst
-
-Configuration
--------------
-
-..  toctree::
-  :maxdepth: 1
-
-  deep_model/rnn/index_en.rst
-
-Optimization
--------------
-
 ..  toctree::
   :maxdepth: 1
 
+  cmd_parameter/index_en.rst
+  cluster/index_en.rst
+  rnn/index_en.rst
   optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling_en.md
similarity index 100%
rename from doc/howto/optimization/cpu_profiling.md
rename to doc/howto/optimization/cpu_profiling_en.md
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
index e2b0b0396e0034b01ed2c5081effdd3bcabf31ae..0239eef4f118197bf92f9fc7d323be58344b0ded 100644
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -1,6 +1,6 @@
-==================
-GPU性能分析与调优
-==================
+============
+GPU性能调优
+============
 
 ..  contents::
 
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/rnn/hierarchical_layer_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
rename to doc/howto/rnn/hierarchical_layer_cn.rst
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
rename to doc/howto/rnn/hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/rnn/index_cn.rst
similarity index 90%
rename from doc/howto/deep_model/rnn/index_cn.rst
rename to doc/howto/rnn/index_cn.rst
index 9ecab5594cff47cde4700b7ce0f58013a960a16e..bcc8c2f46eb662ec3650e829a77992224dbbb8e7 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/rnn/index_cn.rst
@@ -1,4 +1,4 @@
-RNN相关模型
+RNN模型
 ===========
 
 ..  toctree::
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/index_en.rst
rename to doc/howto/rnn/index_en.rst
diff --git a/doc/howto/deep_model/rnn/recurrent_group_cn.md b/doc/howto/rnn/recurrent_group_cn.md
similarity index 100%
rename from doc/howto/deep_model/rnn/recurrent_group_cn.md
rename to doc/howto/rnn/recurrent_group_cn.md
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/rnn/rnn_config_cn.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_cn.rst
rename to doc/howto/rnn/rnn_config_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/rnn/rnn_config_en.rst
similarity index 100%
rename from doc/howto/deep_model/rnn/rnn_config_en.rst
rename to doc/howto/rnn/rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/src/bi_lstm.jpg b/doc/howto/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/howto/deep_model/rnn/src/bi_lstm.jpg
rename to doc/howto/rnn/src/bi_lstm.jpg
diff --git a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png b/doc/howto/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
rename to doc/howto/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn.dot b/doc/howto/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn.dot
rename to doc/howto/rnn/src/glossary_rnn.dot
diff --git a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot b/doc/howto/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
rename to doc/howto/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
rename to doc/howto/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot b/doc/howto/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
rename to doc/howto/rnn/src/simple_full_recurrent.dot
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index ada51c2d73263898b2c748437f8eb0f30b537073..63a78428583477792e309a3b3d26af340caccfca 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,7 +5,8 @@ PaddlePaddle 文档
   :maxdepth: 1
 
   getstarted/index_cn.rst
+  build_and_install/index_cn.rst
   howto/index_cn.rst
+  dev/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
-  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 23b64b6cadf776d44c4d0aa5a550ffe24be13b18..5631381be087017c26b2a6a3984b3c5bdb49f12d 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,6 +5,7 @@ PaddlePaddle Documentation
   :maxdepth: 1
 
   getstarted/index_en.rst
+  build_and_install/index_en.rst
   howto/index_en.rst
+  dev/index_en.rst
   api/index_en.rst
-  mobile/index_en.rst
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58b7043b85b0203ee0dfcd1957710161..0000000000000000000000000000000000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index ef421dacad458828cadf8cf505375d6c4bfd9dde..0000000000000000000000000000000000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8d9260811a8c9274dcaade9b090bab727d1952ca..ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -20,13 +20,16 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
+
+cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 
 cc_test(variable_test SRCS variable_test.cc)
 
-cc_library(threadpool SRCS threadpool.cc)
+cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
 cc_library(scope SRCS scope.cc DEPS glog threadpool)
@@ -74,8 +77,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
+cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table profiler)
+framework_proto backward glog lod_rank_table profiler feed_fetch_method)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -90,9 +95,4 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB FRAMEWORK_HEADERS *.h)
-  install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
-  install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
-endif()
+cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 85e693434af863bfc3bde29989dbbfc69678d3b7..f52a51519fceffd96a4e8db240b23d15ed399fff 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
   std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
   PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
@@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
 
   auto var = root_block->Var(fill_one_op_out);
   var->SetDataType(target.GetDataType());
-  var->SetShape(target.Shape());
+  var->SetShape(target.GetShape());
   auto& target_grad = retv[target.Name()];
   target_grad.name_ = fill_one_op_out;
   target_grad.block_idx_ = root_block_idx;
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index dd2ed87252102aee6d384f37365d19305f19b281..3e344ea3790f57b0f53f36a40263dcdd326e67a9 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -162,9 +162,8 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op, this));
+    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
   }
-
   for (auto &it : other.vars_) {
     auto *var = new VarDesc(*it.second);
     vars_[it.first].reset(var);
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..146f0e9e71ea9101a8f6c71e6c023178f131f967
--- /dev/null
+++ b/paddle/framework/channel.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>  // for size_t
+
+namespace paddle {
+namespace framework {
+
+// Channel is the abstract class of buffered and un-buffered channels.
+template <typename T>
+class Channel {
+ public:
+  virtual bool Send(T*) = 0;
+  virtual bool Receive(T*) = 0;
+  virtual size_t Cap() = 0;
+  virtual void Close() = 0;
+  virtual ~Channel() {}
+};
+
+// Forward declaration of channel implementations.
+namespace details {
+template <typename T>
+class Buffered;
+template <typename T>
+class UnBuffered;
+}  // namespace details
+
+template <typename T>
+Channel<T>* MakeChannel(size_t buffer_size) {
+  if (buffer_size > 0) {
+    return new details::Buffered<T>(buffer_size);
+  }
+  return new details::UnBuffered<T>();
+}
+
+template <typename T>
+void CloseChannel(Channel<T>* ch) {
+  ch->Close();
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+#include "paddle/framework/details/buffered_channel.h"
+#include "paddle/framework/details/unbuffered_channel.h"
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7140dd10661c7b8582930b47872ab0b330c4d66
--- /dev/null
+++ b/paddle/framework/channel_test.cc
@@ -0,0 +1,510 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/channel.h"
+
+#include <chrono>
+#include <thread>
+
+#include "gtest/gtest.h"
+
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+using paddle::framework::details::Buffered;
+using paddle::framework::details::UnBuffered;
+
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(ch->Send(&i), true);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    EXPECT_EQ(ch->Receive(&recv), true);
+    EXPECT_EQ(recv, i);
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
+
+TEST(Channel, MakeAndClose) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+  {
+    // MakeChannel should return a buffered channel is buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel is buffer_size = 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+
+// This tests that a  channel must return false
+// on send and receive performed after closing the channel.
+// Receive will only return false after close when queue is empty.
+// By creating separate threads for sending and receiving, we make this
+// function able to test both buffered and unbuffered channels.
+void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
+  const size_t data = 5;
+  std::thread send_thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }};
+
+  std::thread recv_thread{[&]() {
+    size_t i;
+    EXPECT_EQ(ch->Receive(&i), true);  // should not block
+    EXPECT_EQ(i, data);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+
+  // After closing send should return false. Receive should
+  // also return false as there is no data in queue.
+  CloseChannel(ch);
+  send_thread = std::thread{[&]() {
+    size_t i = data;
+    EXPECT_EQ(ch->Send(&i), false);  // should return false
+  }};
+  recv_thread = std::thread{[&]() {
+    size_t i;
+    // should return false because channel is closed and queue is empty
+    EXPECT_EQ(ch->Receive(&i), false);
+  }};
+
+  send_thread.join();
+  recv_thread.join();
+}
+
+TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
+  size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
+  auto ch = MakeChannel<size_t>(0);
+  SendReceiveWithACloseChannelShouldPanic(ch);
+  delete ch;
+}
+
+TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size / 2; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // receiving should not block
+    EXPECT_EQ(out, i);
+  }
+
+  CloseChannel(ch);
+
+  for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              true);  // receving should return residual values.
+    EXPECT_EQ(out, i);
+  }
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              false);  // receiving on closed channel should return false
+  }
+  delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to write more than buffer size.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
+      else
+        EXPECT_EQ(ch->Send(&i), false);
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum, 45U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
+  auto ch = MakeChannel<int>(0);
+  RecevingOrderEqualToSendingOrder(ch);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch);
+}
+
+void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // If ch is Buffered, atleast 4 threads must be blocked.
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (!thread_ended[i]) ct++;
+    }
+    EXPECT_GE(ct, 4);
+  } else {
+    // If ch is UnBuffered, all the threads should be blocked.
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly close the thread
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  if (dynamic_cast<Buffered<int> *>(ch)) {
+    // Verify that only 1 send was successful
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (send_success[i]) ct++;
+    }
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  ChannelCloseUnblocksSendersTest(ch);
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel also unblocks
+//  unblocks any senders waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Send should block after three iterations
+  // since we only have three receivers.
+  std::thread t([&]() {
+    // Try to send more number of times
+    // than receivers
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 3U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since there are only 5 senders.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
+}
+
+// This tests that destroying a channel unblocks
+//  any senders waiting for channel to have write space
+void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  bool is_buffered_channel = false;
+  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+
+  if (is_buffered_channel) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (is_buffered_channel) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channel also unblocks
+//  any receivers waiting on the channel
+void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
+  size_t buffer_size = 1;
+  auto ch = MakeChannel<int>(buffer_size);
+  ChannelDestroyUnblockSenders(ch);
+}
+
+// This tests that destroying an unbuffered channel also unblocks
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockReceivers(ch);
+}
+
+TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  ChannelDestroyUnblockSenders(ch);
+}
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 6a372ac32e48131eed28e2d42125feb5b92a11c7..98eb3e857d1943e71f1d41f24ecbedbe09e85b7b 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
   }
 }
 
+inline std::string DataTypeToString(const proto::DataType type) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP16:
+      return "float16";
+    case DataType::FP32:
+      return "float32";
+    case DataType::FP64:
+      return "float64";
+    case DataType::INT16:
+      return "int16";
+    case DataType::INT32:
+      return "int32";
+    case DataType::INT64:
+      return "int64";
+    case DataType::BOOL:
+      return "bool";
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out,
+                                const proto::DataType& type) {
+  out << DataTypeToString(type);
+  return out;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..227a4e4811f95441158150396b5b882815fd7844
--- /dev/null
+++ b/paddle/framework/details/buffered_channel.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+#include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Four of the properties of Buffered Channel:
+// - A send to a full channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from an empty channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
+
+template <typename T>
+class Buffered : public paddle::framework::Channel<T> {
+  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
+  virtual size_t Cap() { return cap_; }
+  virtual void Close();
+  virtual ~Buffered();
+
+ private:
+  size_t cap_;
+  std::mutex mu_;
+  std::condition_variable empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::condition_variable destructor_cond_var_;
+  std::deque<T> channel_;
+  std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
+
+  Buffered(size_t cap) : cap_(cap), closed_(false) {
+    PADDLE_ENFORCE_GT(cap, 0);
+  }
+
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
+};
+
+template <typename T>
+bool Buffered<T>::Send(T* item) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
+  send_ctr++;
+  std::unique_lock<std::mutex> lock(mu_);
+  full_cond_var_.wait(lock,
+                      [this]() { return channel_.size() < cap_ || closed_; });
+  if (!closed_) {
+    channel_.push_back(std::move(*item));
+    lock.unlock();
+    empty_cond_var_.notify_one();
+    ret = true;
+  }
+  send_ctr--;
+  destructor_cond_var_.notify_one();
+  return ret;
+}
+
+template <typename T>
+bool Buffered<T>::Receive(T* item) {
+  bool ret = false;
+  // Once the channel has been closed and all data has been consumed,
+  // just return false. Don't even try acquiring the mutex.
+  if (closed_ && channel_.empty()) {
+    return false;
+  }
+  recv_ctr++;
+  std::unique_lock<std::mutex> lock(mu_);
+  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  if (!channel_.empty()) {
+    *item = std::move(channel_.front());
+    channel_.pop_front();
+    full_cond_var_.notify_one();
+    ret = true;
+  }
+  recv_ctr--;
+  destructor_cond_var_.notify_one();
+  return ret;
+}
+
+template <typename T>
+void Buffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+
+template <typename T>
+Buffered<T>::~Buffered() {
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
+  channel_.clear();
+  NotifyAllParticipants(&lock);
+
+  // The destructor must wait for all readers and writers to complete their task
+  // The channel has been closed, so we will not accept new readers and writers
+  lock.lock();
+  destructor_cond_var_.wait(
+      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
+}
+
+template <typename T>
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  full_cond_var_.notify_all();
+  empty_cond_var_.notify_all();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h
index 7e308ffb5a49876aa2c1833b3b7e2a2c7eb137aa..69bcea625288eba897e761a1d634f19c41dc0f79 100644
--- a/paddle/framework/details/cow_ptr.h
+++ b/paddle/framework/details/cow_ptr.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc
index 936954a2333e7e5d2a932abad641279db9ef7b9f..1f4a12bca0dcab2d146cc62cd7ce1c2d7abcddf9 100644
--- a/paddle/framework/details/cow_ptr_test.cc
+++ b/paddle/framework/details/cow_ptr_test.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
index 6d50e820b2b625f932768d2ca671d999071f1ca6..31a40bcbcb3905f01aebefe89526f3cfba8cb8c7 100644
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b5c2196cb2991051c48f7da8397d2f479ca4c58
--- /dev/null
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -0,0 +1,174 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
+#include "paddle/framework/channel.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Four of the properties of UnBuffered Channel:
+// - A send to a channel blocks temporarily until a receive from the
+// channel or the channel is closed.
+// - A receive from a channel blocks temporarily until a send to the
+// channel or the channel is closed.
+// - A send to a closed channel returns false immediately.
+// - A receive from a closed channel returns false immediately.
+template <typename T>
+class UnBuffered : public paddle::framework::Channel<T> {
+  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
+  virtual size_t Cap() { return 0; }
+  virtual void Close();
+  virtual ~UnBuffered();
+
+ private:
+  std::mutex mu_ch_;
+  // Mutex for readers and writers who are waiting for other reader
+  // and writer to complete execution
+  std::recursive_mutex mu_read_, mu_write_;
+  // reader_found_ is set true when a reader is ready to accept data
+  // writer_found_ is set true when a writer is ready to send data
+  // A transaction occurs only when both are true
+  std::atomic<bool> reader_found_{false}, writer_found_{false};
+  std::condition_variable cv_channel_;
+  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
+  T* item{nullptr};
+  std::atomic<bool> closed_{false};
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
+
+  UnBuffered() : closed_(false) {}
+
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
+};
+
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template <typename T>
+bool UnBuffered<T>::Send(T* data) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
+  send_ctr++;
+  // Prevent other writers from entering
+  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+  writer_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+  // If writer comes first, it should wait till a reader arrives
+  cv_writer_.wait(cv_lock,
+                  [this]() { return reader_found_ == true || closed_; });
+  cv_reader_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> channel_lock(mu_ch_);
+    item = data;
+    channel_lock.unlock();
+    cv_channel_.notify_one();
+    channel_lock.lock();
+    cv_channel_.wait(channel_lock,
+                     [this]() { return item == nullptr || closed_; });
+    ret = true;
+  }
+  writer_found_ = false;
+  send_ctr--;
+  cv_destructor_.notify_one();
+  return ret;
+}
+
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
+template <typename T>
+bool UnBuffered<T>::Receive(T* data) {
+  bool ret = false;
+  // If channel is closed, we don't even want any reader to enter.
+  // Unlike a buffered channel, an unbuffered channel does not allow
+  // readers to read after closing because there is no buffer to be consumed.
+  if (closed_) return ret;
+  recv_ctr++;
+  // Prevent other readers from entering
+  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+  reader_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+  // If reader comes first, it should wait till a writer arrives
+  cv_reader_.wait(cv_lock,
+                  [this]() { return writer_found_ == true || closed_; });
+  cv_writer_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> lock_ch{mu_ch_};
+    // Reader should wait for the writer to first write its data
+    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+    if (!closed_) {
+      *data = std::move(*item);
+      item = nullptr;
+      lock_ch.unlock();
+      ret = true;
+    }
+    cv_channel_.notify_one();
+  }
+  reader_found_ = false;
+  recv_ctr--;
+  cv_destructor_.notify_one();
+  return ret;
+}
+
+// This function implements the sequence of events
+// that take place once the channel is closed.
+template <typename T>
+void UnBuffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
+template <typename T>
+UnBuffered<T>::~UnBuffered() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+  lock.lock();
+  cv_destructor_.wait(lock,
+                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
+}
+
+// This function notifies all the readers, writers and
+// the channel condition variables.
+template <typename T>
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  cv_writer_.notify_all();
+  cv_channel_.notify_all();
+  cv_reader_.notify_all();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c28ffefdd0872238299cdbb0653ee17cdad61699..2a88e5a92985fab7311c1edd266cb89f7d76d867 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -17,14 +17,16 @@ limitations under the License. */
 #include <set>
 
 #include "gflags/gflags.h"
+#include "paddle/framework/feed_fetch_method.h"
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
 
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
@@ -32,9 +34,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
 
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
-
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -54,11 +53,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
     var->GetMutable<LoDTensorArray>();
   } else if (var_type == proto::VarDesc::PLACE_LIST) {
     var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarDesc::READER) {
+    var->GetMutable<ReaderHolder>();
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
-        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
-        " PLACE_LIST]",
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER]",
         var_type);
   }
 }
@@ -124,7 +125,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
     op->Run(*local_scope, place_);
     VLOG(3) << op->DebugStringEx(local_scope);
-    if (FLAGS_do_memory_benchmark) {
+    if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
@@ -141,7 +142,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
   }
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";
     VLOG(2) << "Memory used after deleting local scope: "
             << memory::memory_usage(place_);
@@ -149,5 +150,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   }
 }
 
+// Check whether the block already has feed operators and feed_holder.
+// Return false if the block does not have any feed operators.
+// If some feed operators have been prepended to the block, check that
+// the info contained in these feed operators matches the feed_targets
+// and feed_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has feed operators and holder of matching info.
+static bool has_feed_operators(
+    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::string& feed_holder_name) {
+  size_t feed_count = 0;
+  for (auto* op : block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      feed_count++;
+      PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
+                        "Input to feed op should be '%s'", feed_holder_name);
+      std::string feed_target_name = op->Output("Out")[0];
+      PADDLE_ENFORCE(
+          feed_targets.find(feed_target_name) != feed_targets.end(),
+          "Feed operator output name '%s' cannot be found in 'feed_targets'",
+          feed_target_name);
+    }
+  }
+
+  if (feed_count > 0) {
+    PADDLE_ENFORCE_EQ(
+        feed_count, feed_targets.size(),
+        "The number of feed operators should match 'feed_targets'");
+
+    // When feed operator are present, so should be feed_holder
+    auto var = block->FindVar(feed_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            feed_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
+                      "'%s' variable should be 'FEED_MINIBATCH' type",
+                      feed_holder_name);
+  }
+
+  return feed_count > 0;
+}
+
+// Check whether the block already has fetch operators and fetch_holder.
+// Return false if the block does not have any fetch operators.
+// If some fetch operators have been appended to the block, check that
+// the info contained in these fetch operators matches the fetch_targets
+// and fetch_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has fetch operators and holder of matching info.
+static bool has_fetch_operators(
+    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const std::string& fetch_holder_name) {
+  size_t fetch_count = 0;
+  for (auto* op : block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      fetch_count++;
+      PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
+                        "Output of fetch op should be '%s'", fetch_holder_name);
+      std::string fetch_target_name = op->Input("X")[0];
+      PADDLE_ENFORCE(
+          fetch_targets.find(fetch_target_name) != fetch_targets.end(),
+          "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
+          fetch_target_name);
+    }
+  }
+
+  if (fetch_count > 0) {
+    PADDLE_ENFORCE_EQ(
+        fetch_count, fetch_targets.size(),
+        "The number of fetch operators should match 'fetch_targets'");
+
+    // When fetch operator are present, so should be fetch_holder
+    auto var = block->FindVar(fetch_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            fetch_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
+                      "'%s' variable should be 'FETCH_LIST' type",
+                      fetch_holder_name);
+  }
+
+  return fetch_count > 0;
+}
+
+void Executor::Run(const ProgramDesc& program, Scope* scope,
+                   std::map<std::string, const LoDTensor*>& feed_targets,
+                   std::map<std::string, LoDTensor*>& fetch_targets,
+                   const std::string& feed_holder_name,
+                   const std::string& fetch_holder_name) {
+  auto* copy_program = new ProgramDesc(program);
+  auto* global_block = copy_program->MutableBlock(0);
+
+  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+    // create feed_holder variable
+    auto* feed_holder = global_block->Var(feed_holder_name);
+    feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
+    feed_holder->SetPersistable(true);
+
+    int i = 0;
+    for (auto& feed_target : feed_targets) {
+      std::string var_name = feed_target.first;
+      VLOG(3) << "feed target's name: " << var_name;
+
+      // prepend feed op
+      auto* op = global_block->PrependOp();
+      op->SetType(kFeedOpType);
+      op->SetInput("X", {feed_holder_name});
+      op->SetOutput("Out", {var_name});
+      op->SetAttr("col", {static_cast<int>(i)});
+      op->CheckAttrs();
+
+      i++;
+    }
+  }
+
+  // map the data of feed_targets to feed_holder
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      std::string feed_target_name = op->Output("Out")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+                      idx);
+    }
+  }
+
+  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+    // create fetch_holder variable
+    auto* fetch_holder = global_block->Var(fetch_holder_name);
+    fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
+    fetch_holder->SetPersistable(true);
+
+    int i = 0;
+    for (auto& fetch_target : fetch_targets) {
+      std::string var_name = fetch_target.first;
+      VLOG(3) << "fetch target's name: " << var_name;
+
+      // append fetch op
+      auto* op = global_block->AppendOp();
+      op->SetType(kFetchOpType);
+      op->SetInput("X", {var_name});
+      op->SetOutput("Out", {fetch_holder_name});
+      op->SetAttr("col", {static_cast<int>(i)});
+      op->CheckAttrs();
+
+      i++;
+    }
+  }
+
+  Run(*copy_program, scope, 0, true, true);
+
+  // obtain the data of fetch_targets from fetch_holder
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      *fetch_targets[fetch_target_name] =
+          GetFetchVariable(*scope, fetch_holder_name, idx);
+    }
+  }
+
+  delete copy_program;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index d869e18901b82959a40cc296aa0844c20ea63ac1..035ff48a52bd2fc4b1a46b48b1fbf1fbcb2ac70b 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -41,6 +41,12 @@ class Executor {
   void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
            bool create_vars = true);
 
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+
  private:
   const platform::Place place_;
 };
diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/framework/feed_fetch_method.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21201b675519e34b11e9f1f3a6f2a135c06d63a7
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_method.h"
+#include "glog/logging.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                     const std::string& var_name, size_t index) {
+  // If var_name Variable is not found in GlobalScope, a new variable will
+  // be created.
+  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  Variable* g_feed_value = scope->Var(var_name);
+  auto& feed_inputs =
+      *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
+  if (index >= feed_inputs.size()) {
+    feed_inputs.resize(index + 1);
+  }
+  // shared data with input tensor
+  feed_inputs[index].ShareDataWith(input);
+  // set lod
+  feed_inputs[index].set_lod(input.lod());
+}
+
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+                            size_t index) {
+  // Since we want to fetch LodTensor from a variable, the variable must
+  // be created alreadly.
+  Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+                 "Only %s can be invoked by GetFetchVariable",
+                 typeid(FeedFetchList).name());
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  auto& tensor = fetch_outputs[index];
+  VLOG(3) << "Fetch " << var_name << " with index " << index
+          << " shape= " << tensor.dims();
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  return tensor;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
index 7feacb1e24708411e7fbb610f9909447cba9e291..b71945fcc8834d2e5fe21151e1e88788b4acd5c1 100644
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "glog/logging.h"
+
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/scope.h"
-#include "paddle/framework/variable.h"
 
 namespace paddle {
 namespace framework {
 
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
-                     const std::string& var_name, size_t index) {
-  // If var_name Variable is not found in GlobalScope, a new variable will
-  // be created.
-  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
-  Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs =
-      *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
-  if (index >= feed_inputs.size()) {
-    feed_inputs.resize(index + 1);
-  }
-  // shared data with input tensor
-  feed_inputs[index].ShareDataWith(input);
-  // set lod
-  feed_inputs[index].set_lod(input.lod());
-}
+                     const std::string& var_name, size_t index);
 
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
-                            size_t index) {
-  // Since we want to fetch LodTensor from a variable, the variable must
-  // be created alreadly.
-  Variable* g_fetch_value = scope.FindVar(var_name);
-  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
-                 "Only %s can be invoked by GetFetchVariable",
-                 typeid(FeedFetchList).name());
-  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
-  auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
-  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
-  return tensor;
-}
+                            size_t index);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
index 9bc4a90c44828ecb7458d524f59609f01848cc5c..168f456675af508df86dd0520cdeb5d16d94ad31 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
 
@@ -20,5 +21,8 @@ namespace paddle {
 namespace framework {
 using FeedFetchType = LoDTensor;
 using FeedFetchList = std::vector<FeedFetchType>;
+
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 5b6ef03f610926578d2c02dcf06f399f106a30a1..d7be1a7352da56e411396614e33919bb55bc3b0f 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -126,13 +128,15 @@ message VarDesc {
     LOD_RANK_TABLE = 6;
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
+    READER = 9;
   }
   required string name = 1;
   required VarType type = 2;
-  optional LoDTensorDesc lod_tensor = 3;
-  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 3 [ default = false ];
+  optional LoDTensorDesc lod_tensor = 4;
+  optional TensorDesc selected_rows = 5;
   optional LoDTensorArrayDesc tensor_array = 6;
-  optional bool persistable = 5 [ default = false ];
+  optional ReaderDesc reader = 7;
 }
 
 message BlockDesc {
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 4ef82a541efaa35bcf831d5122570154f2fa2423..3f6ea121b3994979d89a7d5a8c20c59240a0c111 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string.h>  // for strdup
 #include <algorithm>
+#include <stdexcept>
 #include <string>
 
 #include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
 
   std::vector<platform::Place> places;
   places.emplace_back(platform::CPUPlace());
+  int count = 0;
 
 #ifdef PADDLE_WITH_CUDA
-  int count = platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(platform::CUDAPlace(i));
+  try {
+    count = platform::GetCUDADeviceCount();
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
   }
 #else
   LOG(WARNING)
-      << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+      << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
 #endif
 
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(platform::CUDAPlace(i));
+  }
+
   platform::DeviceContextPool::Init(places);
 }
 
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
index f837a965d3be7d40c20803ae4462b3bfd91bffd0..01e076dd8ea24831e3ed7c8a7f8fae6818a89335 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
   using paddle::platform::DeviceContextPool;
 
+#ifndef PADDLE_WITH_CUDA
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_GE(pool.size(), 1U);
+  ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+
+TEST(InitDevices, CUDA) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_CUDA
+  int count = paddle::platform::GetCUDADeviceCount();
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
 }
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 53b0d0fe083579da4f0bb600f292765aa2aa0d8a..cb27de6991674247e6215ce64a2da5000fa78ed4 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
 
-#include <glog/logging.h>
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9d1294fdeb9bd76bf944f7ec3687e3c5bb333241..be2b301619639106ac7b578e5a79cf33f4379e48 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 #endif
 
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-#ifndef PADDLE_WITH_CUDA
-template <typename T>
-using Vector = std::vector<T>;
-#else
-template <typename T>
-using Vector = thrust::host_vector<
-    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
-#endif
-
 /*
  * LoD is short for Level of Details.
  *
@@ -55,7 +46,29 @@ using Vector = thrust::host_vector<
  *    0 2 4 7
  *    0 2 5 7 10 12 15 20
  */
-using LoD = std::vector<Vector<size_t>>;
+struct LoD : public std::vector<Vector<size_t>> {
+  using std::vector<Vector<size_t>>::vector;
+  platform::Place place() const {
+    if (this->size() == 0) {
+      // Not Initialze Yet.
+      return platform::CPUPlace();
+    } else {
+      return this->front().place();
+    }
+  }
+
+  void CopyFromCUDA() {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyFromCUDA();
+    }
+  }
+
+  void CopyToPeer(platform::Place place) {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyToPeer(place);
+    }
+  }
+};
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +122,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
  */
 class LoDTensor : public Tensor {
  public:
-  LoDTensor() {}
+  LoDTensor() : Tensor() {}
+
+  /* Constructor with place should only be used in pybind */
+  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
 
   explicit LoDTensor(const LoD& lod) : lod_(lod) {}
 
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 4d172c43c7cceacb7d0dfaf1c4d3028717350268..3b63020e685436396071fa05cd7697630ae56c95 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
 namespace paddle {
 namespace framework {
 
+TEST(LoD, data) {
+  LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+  auto& v = lod[0];
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], i);
+  }
+}
+
 TEST(LodExpand, test) {
   LoD lod{{0, 2}};
   LoDTensor tensor;
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 1e253a2f6f35e827fb2e5db6270da03705b39514..adea02e3b3fdcf4873de76ff91116f43ac9fe259 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/platform/assert.h"
 
@@ -26,7 +28,26 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
+TEST(LoD, data) {
+  paddle::framework::InitDevices();
+
+  paddle::framework::LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+  auto& v = lod[0];
+  test<<<1, 1>>>(v.cuda_data(), v.size());
+  cudaDeviceSynchronize();
+
+  v.CopyFromCUDA();
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], i * 2);
+  }
+}
+
 TEST(LoDTensor, LoDInGPU) {
+  paddle::framework::InitDevices();
+
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
 
@@ -42,8 +63,9 @@ TEST(LoDTensor, LoDInGPU) {
 
   auto lod = lod_tensor.lod();
 
-  test<<<1, 8>>>(lod[0].data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
   cudaDeviceSynchronize();
+  lod.CopyFromCUDA();
 
   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..5202775515d335ff81bb17e6ce21338c40041ca3
--- /dev/null
+++ b/paddle/framework/mixed_vector.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <initializer_list>
+#include <vector>
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Vector support both cpu and gpu.
+ * host vector lifetime is same with Vector
+ * device vector is lazily malloc and modified.
+ */
+
+template <typename T>
+class Vector : public std::vector<T> {
+ public:
+  using std::vector<T>::vector;
+
+  Vector() {}
+  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
+
+  inline platform::Place place() const { return place_; }
+
+  /*! Return a pointer to constant memory block. */
+  inline const T *data(platform::Place place) const;
+
+  /*! Return a pointer to mutable memory block. */
+  inline T *mutable_data(platform::Place place);
+
+  // TODO(dzhwinter): below interfaces should be removed
+  /* Get device vector */
+  T *cuda_data() {
+    CopyToCUDA();
+    PADDLE_ENFORCE_NOT_NULL(
+        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+    return static_cast<T *>(cuda_ptr_.get());
+  }
+
+  /* Get host vector */
+  T *data() { return std::vector<T>::data(); }
+  const T *data() const { return std::vector<T>::data(); }
+
+  T *data(const platform::Place &place) {
+    if (platform::is_cpu_place(place)) {
+      return data();
+    } else {
+      return cuda_data();
+    }
+  }
+
+  /* Synchronize host vector to device vector */
+  void CopyToCUDA();
+  /* Synchronize device vector to host vector */
+  void CopyFromCUDA();
+  /* Switch device vector location */
+  void CopyToPeer(platform::Place);
+
+ private:
+  std::shared_ptr<void> cuda_ptr_;
+  size_t cuda_size_ = 0;  // device vector numel
+  platform::CUDAPlace place_;
+};
+
+template <typename T>
+inline const T *Vector<T>::data(platform::Place place) const {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (cuda_ptr_ == nullptr) {
+      return nullptr;
+    }
+    if (boost::get<platform::CUDAPlace>(place) == place_) {
+      return static_cast<const T *>(cuda_ptr_.get());
+    } else {
+      PADDLE_THROW(
+          "Unmatched place. Please use `mutable_data` copy lod to the target "
+          "Place first.");
+    }
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+inline T *Vector<T>::mutable_data(platform::Place place) {
+  if (platform::is_cpu_place(place)) {
+    return std::vector<T>::data();
+  } else if (platform::is_gpu_place(place)) {
+    if (boost::get<platform::CUDAPlace>(place) != place_) {
+      place_ = boost::get<platform::CUDAPlace>(place);
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+      cuda_ptr_.reset(
+          memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+          memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+    }
+    cuda_size_ = this->size();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *ctx = pool.GetByPlace(place_);
+    memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+                 static_cast<const void *>(this->data()),
+                 this->size() * sizeof(T), ctx->stream());
+    ctx->Wait();
+    return static_cast<T *>(cuda_ptr_.get());
+#else
+    return nullptr;
+#endif
+  } else {
+    PADDLE_THROW("Unsupport Place.");
+  }
+}
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
+  ctx->Wait();
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_ptr_ == nullptr) {
+    LOG(WARNING) << "No uncommitted cuda data.";
+    return;
+  }
+  this->resize(cuda_size_);
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
+               static_cast<const void *>(cuda_ptr_.get()),
+               this->size() * sizeof(T), ctx->stream());
+  ctx->Wait();
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyToPeer(platform::Place place) {
+#ifdef PADDLE_WITH_CUDA
+  if (boost::get<platform::CUDAPlace>(place) != place_) {
+    place_ = boost::get<platform::CUDAPlace>(place);
+  }
+  if (cuda_size_ < this->size() || cuda_ptr_ == nullptr) {
+    cuda_ptr_.reset(
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T)),
+        memory::PlainDeleter<void, platform::CUDAPlace>(place_));
+  }
+  cuda_size_ = this->size();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_.get(), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), ctx->stream());
+  ctx->Wait();
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/mixed_vector_test.cu b/paddle/framework/mixed_vector_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b571788ad1ade50e05dc9a70cba35b83f8db3ea
--- /dev/null
+++ b/paddle/framework/mixed_vector_test.cu
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/mixed_vector.h"
+
+using namespace paddle::framework;
+using namespace paddle::platform;
+using namespace paddle::memory;
+
+template <typename T>
+__global__ void test(T* data, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    data[i] *= 2;
+  }
+}
+
+TEST(Vector, Normal) {
+  // fill the device context pool.
+  InitDevices();
+
+  Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(Vector, MultipleCopy) {
+  InitDevices();
+  Vector<size_t> vec({1, 2, 3});
+  CUDAPlace place(0);
+  vec.mutable_data(place);
+  auto vec2 = Vector<size_t>(vec);
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i]);
+    }
+  }
+  test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
+  vec2.CopyFromCUDA();
+  {
+    const size_t* ptr = vec2.data(CPUPlace());
+    for (size_t i = 0; i < vec2.size(); ++i) {
+      EXPECT_EQ(*(ptr + i), vec[i] * 2);
+    }
+  }
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index f8df2cf97ad532f06cb1393b1a24cd789f8bde29..b51afe499bbc0e6b727aeeb4334f56e400ea81a5 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   bool HasOutputs(const std::string &name) const override;
 
-  DDim GetInputDim(const std::string &name) const override;
-
-  void SetOutputDim(const std::string &name, const DDim &dim) override;
-
   AttrReader Attrs() const override;
 
   const std::vector<std::string> &Inputs(
@@ -76,6 +72,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   void SetDim(const std::string &name, const DDim &dim) override;
 
+  std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
+
+  void SetRepeatedDims(const std::string &name,
+                       const std::vector<DDim> &dims) override;
+
   const OpDesc &op_;
   const BlockDesc &block_;
 };
@@ -124,11 +125,10 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
   // restore attrs_
   for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
     std::string attr_name = attr.name();
+    // The sub_block referred to by the BLOCK attr hasn't been added
+    // to ProgramDesc class yet, we skip setting BLOCK attr here.
     if (attr.type() != proto::AttrType::BLOCK) {
       attrs_[attr_name] = GetAttrValue(attr);
-    } else {
-      auto bid = attr.block_idx();
-      attrs_[attr_name] = prog->MutableBlock(bid);
     }
   }
   this->block_ = block;
@@ -444,21 +444,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
   return true;
 }
 
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
-  std::vector<DDim> ddims = GetInputsDim(name);
-  auto length = ddims.size();
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have 1 value, "
-                    "but it has %d now",
-                    name, length);
-  return ddims[0];
-}
-
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
-                                                const DDim &dim) {
-  SetOutputsDim(name, {dim});
-}
-
 AttrReader CompileTimeInferShapeContext::Attrs() const {
   return AttrReader(op_.GetAttrMap());
 }
@@ -476,23 +461,48 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  DDim res;
   try {
-    auto shape = var->Shape();
-    if (shape.empty()) {
-      return framework::make_ddim({0UL});
-    } else {
-      return framework::make_ddim(var->Shape());
-    }
+    auto shape = var->GetShape();
+    res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
   } catch (...) {
     VLOG(5) << "GetDim of variable " << name << " error";
     std::rethrow_exception(std::current_exception());
   }
+  return res;
+}
+
+std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
+    const std::string &name) const {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<DDim> res;
+  try {
+    auto shapes = var->GetShapes();
+    for (const auto &s : shapes) {
+      res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
+    }
+  } catch (...) {
+    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    std::rethrow_exception(std::current_exception());
+  }
+  return res;
 }
 
 void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
-  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+  block_.FindVarRecursive(name)->SetShape(vectorize(dim));
+}
+
+void CompileTimeInferShapeContext::SetRepeatedDims(
+    const std::string &name, const std::vector<DDim> &dims) {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  std::vector<std::vector<int64_t>> dim_vec(dims.size());
+  std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize);
+  var->SetShapes(dim_vec);
 }
+
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
 
 proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
index 649afeee8a846b0579545f2edff77e9dbe3b4dd8..cb23bbde01493d1a3b5845e77d6160a75f409c7a 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
   OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                               LibraryType::kCUDNN);
 
-  ASSERT_EQ(
-      paddle::framework::KernelTypeToString(op_kernel_type),
-      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
+            "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+            "CUDNN]");
 }
 
 TEST(OpKernelType, Hash) {
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 831b1e2a1e10777d9e89364adcd4b1f367e86080..52387aabd9d0b41b13814499fb3f0107f42401e7 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
 
-DEFINE_bool(op_sync, false,
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
 
 namespace paddle {
 namespace framework {
@@ -322,8 +320,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input %s should not have more than one inputs", name);
     auto ipt = ins[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -335,8 +333,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (length == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
-                      name);
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output %s should not have more than one inputs", name);
     auto ipt = outs[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return true;
   }
 
-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
-
   AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
 
   const std::vector<std::string>& Inputs(
@@ -431,8 +421,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
     } else if (var->IsType<SelectedRows>()) {
       return var->Get<SelectedRows>().GetCompleteDims();
     } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, var->Type().name());
+      PADDLE_THROW(
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      return var->Get<ReaderHolder>().shapes();
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
     }
   }
 
@@ -448,6 +452,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<ReaderHolder>()) {
+      var->GetMutable<ReaderHolder>()->set_shapes(dims);
+    } else {
+      PADDLE_THROW(
+          "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
+  }
+
   proto::VarDesc::VarType GetVarType(const std::string& name) const override {
     auto* var = scope_.FindVar(name);
     return ToVarType(var->Type());
@@ -531,7 +548,7 @@ void OperatorWithKernel::Run(const Scope& scope,
       ExecutionContext(*this, new_scope, *new_dev_ctx));
 
   /*For profiling/benchmark only*/
-  if (FLAGS_op_sync) {
+  if (FLAGS_benchmark) {
     new_dev_ctx->Wait();
   }
 }
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index b5d9e5e385c1ba57169ef885824fc23b0f130692..0e937dda4e185590648962a6d4f827eea21eb620 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
@@ -42,11 +43,20 @@ ProgramDesc::ProgramDesc() {
 
 ProgramDesc::ProgramDesc(const ProgramDesc &o) {
   desc_ = o.desc_;
-
   for (int i = 0; i < desc_.blocks_size(); ++i) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
@@ -54,6 +64,16 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 ProgramDesc::ProgramDesc(const std::string &binary_str) {
@@ -64,5 +84,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
   }
 }
 
+const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> feed_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
+    }
+  }
+  return feed_target_names;
+}
+
+const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> fetch_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      fetch_target_names.push_back(op->Input("X")[0]);
+    }
+  }
+  return fetch_target_names;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 15a962bb696d6172acd1a83cf9bb1ffd0846d449..8e958eab6ee08436ca73b13bac010e66c7df2b8b 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/proto_desc.h"
 #include "paddle/platform/macros.h"
@@ -45,6 +46,9 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
+  const std::vector<std::string> GetFeedTargetNames();
+  const std::vector<std::string> GetFetchTargetNames();
+
  private:
   proto::ProgramDesc desc_;
 
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 59947c9f2189348226b7ff6c2b9315196bbf55fa..9945aee31b647a6243971c7e64c8391c0b1c09c5 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_NE(copy, var_before);
     ASSERT_EQ(copy->Name(), var_before->Name());
     ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
     ASSERT_EQ(copy->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
@@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
     ASSERT_NE(restored, var_before);
     ASSERT_EQ(restored->Name(), var_before->Name());
     ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
     ASSERT_EQ(restored->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index 25eb813ffb96e9b1e13299421ead9f85c02da59f..ddd6b993d40f72cba919fad95318f70409c98bca 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include <glog/logging.h>
@@ -48,11 +49,28 @@ bool IsTarget(const proto::OpDesc& op_desc) {
   return false;
 }
 
-void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
-                int block_id) {
-  // TODO(tonyyang-svail):
-  //    - will change to use multiple blocks for RNN op and Cond Op
+int GetSubBlockIndex(const proto::OpDesc& op_desc) {
+  for (auto& attr : op_desc.attrs()) {
+    if (attr.type() == proto::AttrType::BLOCK) {
+      PADDLE_ENFORCE(attr.has_block_idx());
+      return attr.block_idx();
+    }
+  }
+  return -1;
+}
 
+bool HasSubBlock(const proto::OpDesc& op_desc) {
+  return GetSubBlockIndex(op_desc) > 0;
+}
+
+// block_id is the idx of the current block in the input desc
+// parent_block_id is the idx of the parent of the current block
+// in the output desc, -1 means the current block is global block
+// dependent_vars is passed recursively from the parent block to
+// the child block to help pruning
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id, int parent_block_id,
+                std::set<std::string>& dependent_vars) {
   auto& block = input.blocks(block_id);
   auto& ops = block.ops();
 
@@ -71,11 +89,9 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
     expect_fetch = (op_desc.type() == kFetchOpType);
   }
 
-  std::set<std::string> dependent_vars;
   std::vector<bool> should_run;
   for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
     auto& op_desc = *op_iter;
-
     if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
       // insert its input to the dependency graph
       for (auto& var : op_desc.inputs()) {
@@ -83,7 +99,6 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
           dependent_vars.insert(argu);
         }
       }
-
       should_run.push_back(true);
     } else {
       should_run.push_back(false);
@@ -94,19 +109,81 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
   // we reverse the should_run vector
   std::reverse(should_run.begin(), should_run.end());
 
-  *output = input;
-  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  // copy the current block from input to output
+  auto* block_field = output->mutable_blocks();
+  *block_field->Add() = input.blocks(block_id);
+
+  int output_block_id = output->blocks_size() - 1;
+  auto* output_block = output->mutable_blocks(output_block_id);
+  output_block->set_idx(output_block_id);
+  output_block->set_parent_idx(parent_block_id);
+
+  auto* op_field = output_block->mutable_ops();
   op_field->Clear();
   for (size_t i = 0; i < should_run.size(); ++i) {
     if (should_run[i]) {
-      *op_field->Add() = input.blocks(block_id).ops(i);
+      auto* op = op_field->Add();
+      *op = input.blocks(block_id).ops(i);
+      if (HasSubBlock(*op)) {
+        // create sub_block_dependent_vars here to help prune the sub block
+        std::set<std::string> sub_block_dependent_vars;
+        for (auto& var : op->inputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        for (auto& var : op->outputs()) {
+          for (auto& argu : var.arguments()) {
+            sub_block_dependent_vars.insert(argu);
+          }
+        }
+        // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
+        // output_block_id is the idx of the current block in the output desc
+        prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
+                   sub_block_dependent_vars);
+      }
     }
   }
+
+  // remove the VarDescs in BlockDesc that are not referenced in
+  // the pruned OpDescs
+  std::unordered_map<std::string, proto::VarDesc> var_map;
+  auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars();
+  for (const auto& var : *var_field) {
+    var_map[var.name()] = var;
+  }
+
+  std::set<std::string> var_names;
+  for (const auto& op : *op_field) {
+    auto& input_field = op.inputs();
+    for (auto& input_var : input_field) {
+      for (auto& arg : input_var.arguments()) {
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
+      }
+    }
+    auto& output_field = op.outputs();
+    for (auto& output_var : output_field) {
+      for (auto& arg : output_var.arguments()) {
+        if (var_map.count(arg) != 0) {
+          var_names.insert(arg);
+        }
+      }
+    }
+  }
+
+  var_field->Clear();
+  for (const auto& name : var_names) {
+    *var_field->Add() = var_map[name];
+  }
 }
 
 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
 void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
-  prune_impl(input, output, 0);
+  std::set<std::string> dependent_vars;
+  output->clear_blocks();
+  prune_impl(input, output, 0, -1, dependent_vars);
 }
 
 void inference_optimize_impl(const proto::ProgramDesc& input,
diff --git a/paddle/framework/reader.cc b/paddle/framework/reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..928b661aaadb4a59294de24cc1c414795c2878d5
--- /dev/null
+++ b/paddle/framework/reader.cc
@@ -0,0 +1,122 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace framework {
+
+DDim ReaderBase::shape(size_t idx) const {
+  PADDLE_ENFORCE_LT(
+      idx, shapes_.size(),
+      "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
+      shapes_.size());
+  return shapes_[idx];
+}
+
+void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
+  if (iteration_pos_ >= buffer_.size()) {
+    // Reload buffer with new data
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    for (int i = 0; i < buffer_size_; ++i) {
+      if (reader_->HasNext()) {
+        buffer_.push_back(std::vector<LoDTensor>());
+        reader_->ReadNext(&buffer_.back());
+      } else {
+        break;
+      }
+    }
+    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+    // optimize.
+    std::random_shuffle(buffer_.begin(), buffer_.end());
+    iteration_pos_ = 0;
+  }
+  out->clear();
+  if (!buffer_.empty()) {
+    std::swap(*out, buffer_[iteration_pos_++]);
+  }
+  // if buffer_ is empty, the 'out' will return as an empty vector.
+}
+
+void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (int i = 0; i < batch_size_; ++i) {
+    if (reader_->HasNext()) {
+      buffer_.push_back(std::vector<LoDTensor>());
+      reader_->ReadNext(&buffer_.back());
+    } else {
+      break;
+    }
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // if buffer_ is empty, the 'out' will return as an empty vector.
+    return;
+  }
+  int out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (int j = 0; j < out_num; ++j) {
+    // Merge shape and check date type
+    std::type_index batch_type = buffer_[0][j].type();
+    DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      std::type_index ins_type = buffer_[i][j].type();
+      DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+
+    LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+
+    // Merge lod and data
+    LoD batch_lod;
+    std::vector<size_t> top_level_lod({0});
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      DDim ins_shape = buffer_[i][j].dims();
+      LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+          }
+        }
+      }
+      top_level_lod.push_back(
+          top_level_lod.back() +
+          (ins_lod.empty() ? ins_shape[0] : (ins_lod[0].size() - 1)));
+
+      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      Copy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    batch_lod.insert(batch_lod.begin(), top_level_lod);
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/reader.h b/paddle/framework/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..534894cfbd66687fc982f1def4cb0d05d77a4583
--- /dev/null
+++ b/paddle/framework/reader.h
@@ -0,0 +1,161 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  virtual bool HasNext() const = 0;
+
+  virtual void ReInit() = 0;
+
+  DDim shape(size_t idx) const;
+  std::vector<DDim> shapes() const { return shapes_; }
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
+};
+
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader)
+      : ReaderBase(reader->shapes()), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+
+  bool HasNext() const override { return reader_->HasNext(); }
+
+  void ReInit() override { reader_->ReInit(); }
+
+ protected:
+  ReaderBase* reader_;
+};
+
+// file readers
+
+template <typename T>
+class RandomDataGenerator : public FileReader {
+ public:
+  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
+      : FileReader(shapes), min_(min), max_(max) {
+    PADDLE_ENFORCE_LE(
+        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(min_, max_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be 2 at least.(Now it's %d)",
+          shape.size());
+      LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+
+  bool HasNext() const override { return true; }
+
+  void ReInit() override { return; }
+
+ private:
+  float min_;
+  float max_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+};
+
+// decorated readers
+
+class ShuffleReader : public DecoratedReader {
+ public:
+  ShuffleReader(ReaderBase* reader, int buffer_size)
+      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+    buffer_.reserve(buffer_size);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int buffer_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+  size_t iteration_pos_;
+};
+
+class BatchReader : public DecoratedReader {
+ public:
+  BatchReader(ReaderBase* reader, int batch_size)
+      : DecoratedReader(reader), batch_size_(batch_size) {
+    buffer_.reserve(batch_size_);
+  }
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ private:
+  int batch_size_;
+  std::vector<std::vector<LoDTensor>> buffer_;
+};
+
+// The ReaderHolder is used as readers' unified wrapper,
+// making it easier to access different type readers in Variables.
+class ReaderHolder {
+ public:
+  void Reset(ReaderBase* reader) { reader_.reset(reader); }
+
+  ReaderBase* Get() const { return reader_.get(); }
+
+  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
+  bool HasNext() const { return reader_->HasNext(); }
+  void ReInit() { reader_->ReInit(); }
+
+  DDim shape(size_t idx) const { return reader_->shape(idx); }
+  std::vector<DDim> shapes() const { return reader_->shapes(); }
+  void set_shapes(const std::vector<DDim>& shapes) {
+    reader_->set_shapes(shapes);
+  }
+
+ private:
+  std::unique_ptr<ReaderBase> reader_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index a67ff910093d93060d07d849f6e968e5f4ce21cd..af08b2ab816f63c05d4c65df9601c787e57994f5 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
 
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
             "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs");
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
 
 namespace paddle {
 namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
     delete scope;
   } else {
     Async([scope] { delete scope; });
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index e53cc0cdabc623ae358f1a3e21823a2f38ec3c62..2f4d45057715d2c6f26bca74d1d691207b528207 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,28 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+  const std::vector<std::string> &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Input(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  return this->GetDim(arg_names[0]);
+}
+
+std::vector<DDim> InferShapeContext::GetInputsDim(
     const std::string &name) const {
-  const std::vector<std::string> &names = Inputs(name);
-  return GetDims(names);
+  const std::vector<std::string> &arg_names = Inputs(name);
+  return GetDims(arg_names);
+}
+
+std::vector<DDim> InferShapeContext::GetReaderDims(
+    const std::string &name) const {
+  const std::vector<std::string> &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader input '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->GetRepeatedDims(arg_names[0]);
 }
 
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,15 +48,33 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
   return this->GetDim(names[idx]);
 }
 
-void InferShapeContext::SetOutputsDim(
-    const std::string &name, const std::vector<framework::DDim> &dims) {
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
+  auto &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Output(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  SetDim(arg_names[0], dim);
+}
+
+void InferShapeContext::SetOutputsDim(const std::string &name,
+                                      const std::vector<DDim> &dims) {
   auto &names = Outputs(name);
   SetDims(names, dims);
 }
 
-std::vector<framework::DDim> InferShapeContext::GetDims(
+void InferShapeContext::SetReaderDims(const std::string &name,
+                                      const std::vector<DDim> &dims) {
+  const std::vector<std::string> &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(
+      arg_names.size(), 1UL,
+      "Reader output '%s' should hold one element, but now it holds %d", name,
+      arg_names.size());
+  return this->SetRepeatedDims(arg_names[0], dims);
+}
+
+std::vector<DDim> InferShapeContext::GetDims(
     const std::vector<std::string> &names) const {
-  std::vector<framework::DDim> ret;
+  std::vector<DDim> ret;
   ret.reserve(names.size());
   std::transform(
       names.begin(), names.end(), std::back_inserter(ret),
@@ -47,7 +83,7 @@ std::vector<framework::DDim> InferShapeContext::GetDims(
 }
 
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<framework::DDim> &dims) {
+                                const std::vector<DDim> &dims) {
   size_t length = names.size();
   PADDLE_ENFORCE_EQ(length, dims.size());
   for (size_t i = 0; i < length; ++i) {
@@ -57,14 +93,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
     const std::string &name) const {
   return GetVarTypes(Inputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
     const std::string &name) const {
   return GetVarTypes(Outputs(name));
 }
+
 std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
     const std::vector<std::string> &names) const {
   std::vector<proto::VarDesc::VarType> retv;
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index f93319d8f2fd4c5d388bd57fd595a6a5edd51775..7bee86985239de73fca9aef1faefc04f7615f3ce 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,14 @@ class InferShapeContext {
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
 
-  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
-
-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  DDim GetInputDim(const std::string &name) const;
+  std::vector<DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetReaderDims(const std::string &name) const;
   DDim GetInputsElementDim(const std::string &name, int idx) const;
 
-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
-  void SetOutputsDim(const std::string &name,
-                     const std::vector<framework::DDim> &dims);
+  void SetOutputDim(const std::string &name, const DDim &dim);
+  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
+  void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
 
   virtual AttrReader Attrs() const = 0;
   virtual const std::vector<std::string> &Inputs(
@@ -57,15 +57,16 @@ class InferShapeContext {
 
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+               const std::vector<DDim> &dims);
 
  protected:
-  virtual framework::DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
-
-  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const;
+  virtual DDim GetDim(const std::string &name) const = 0;
+  virtual void SetDim(const std::string &name, const DDim &dim) = 0;
+  virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
+  virtual void SetRepeatedDims(const std::string &name,
+                               const std::vector<DDim> &dims) = 0;
 
+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
   std::vector<proto::VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
 
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4aaa29d794c95592832a1fe990e2dce274eba9d5..f0ea709a5c37e769e3ffa1b2e9d1e39721979251 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Constructor with place should only be used in pybind. */
+  explicit Tensor(const platform::Place& place) : offset_(0) {
+    holder_->set_place(place);
+  }
+
   /*! Return a pointer to mutable memory block. */
   template <typename T>
   inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
     virtual void set_type(std::type_index type) = 0;
+    virtual void set_place(platform::Place place) = 0;
   };
 
   template <typename Place>
@@ -156,6 +162,7 @@ class Tensor {
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return type_; }
     virtual void set_type(std::type_index type) { type_ = type; }
+    virtual void set_place(platform::Place place) { place_ = place; }
 
     /*! the pointer of memory block. */
     std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
index 109a7e7dc440d91e8223f2c0924f489f54a06f64..b7d7c00bcf9d9770f58284023ca2defcda299d64 100644
--- a/paddle/framework/threadpool.cc
+++ b/paddle/framework/threadpool.cc
@@ -1,24 +1,95 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include "paddle/framework/threadpool.h"
 
+#include "paddle/platform/enforce.h"
+
 namespace paddle {
 namespace framework {
 
-std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
-std::once_flag ThreadPool::init_flag;
+std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
+std::once_flag ThreadPool::init_flag_;
+
+ThreadPool* ThreadPool::GetInstance() {
+  std::call_once(init_flag_, &ThreadPool::Init);
+  return threadpool_.get();
+}
+
+void ThreadPool::Init() {
+  if (threadpool_.get() == nullptr) {
+    // TODO(Yancey1989): specify the max threads number
+    int num_threads = std::thread::hardware_concurrency();
+    PADDLE_ENFORCE_GT(num_threads, 0);
+    threadpool_.reset(new ThreadPool(num_threads));
+  }
+}
+
+ThreadPool::ThreadPool(int num_threads)
+    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+  threads_.resize(num_threads);
+  for (auto& thread : threads_) {
+    // TODO(Yancey1989): binding the thread on the specify CPU number
+    thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+  }
+}
+
+ThreadPool::~ThreadPool() {
+  {
+    // notify all threads to stop running
+    running_ = false;
+    scheduled_.notify_all();
+  }
+
+  for (auto& t : threads_) {
+    t->join();
+    t.reset(nullptr);
+  }
+}
+
+void ThreadPool::Wait() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  completed_.wait(lock, [=] { return Done() == true; });
+}
+
+void ThreadPool::TaskLoop() {
+  while (running_) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+    if (!running_) {
+      break;
+    }
+    // pop a task from the task queue
+    auto task = std::move(tasks_.front());
+    tasks_.pop();
+
+    --idle_threads_;
+    lock.unlock();
+
+    // run the task
+    task();
+
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      ++idle_threads_;
+      if (Done()) {
+        completed_.notify_all();
+      }
+    }
+  }
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 3ac345851c38557f82698786dd3bc8e1202a4256..77d31a1176d5947655b57c70846294093a7bb5ef 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -20,141 +20,117 @@ limitations under the License. */
 #include <mutex>
 #include <queue>
 #include <thread>
-
+#include <vector>
+#include "glog/logging.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace framework {
 
+// ThreadPool maintains a queue of tasks, and runs them using a fixed
+// number of threads.
 class ThreadPool {
  public:
-  typedef std::packaged_task<void()> Task;
-
-  /**
-   * @brief   Get a instance of threadpool, the thread number will
-   *          be specified as the number of hardware thread contexts
-   */
-  static ThreadPool* GetInstance() {
-    std::call_once(init_flag, &ThreadPool::Init);
-    return threadpool.get();
-  }
+  using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
 
-  ~ThreadPool() {
-    {
-      // notify all threads to stop running
-      running_ = false;
-      scheduled_.notify_all();
-    }
+  // Returns the singleton of ThreadPool.
+  static ThreadPool* GetInstance();
 
-    for (auto& t : threads_) {
-      t->join();
-      t.reset(nullptr);
-    }
-  }
+  ~ThreadPool();
 
-  int GetNumThreads() const { return num_threads_; }
+  // Returns the number of threads created by the constructor.
+  size_t Threads() const { return total_threads_; }
 
-  int GetAvailable() {
+  // Returns the number of currently idle threads.
+  size_t IdleThreads() {
     std::unique_lock<std::mutex> lock(mutex_);
-    return available_;
+    return idle_threads_;
   }
 
-  /**
-   * @brief   Push a function to the queue, and will be scheduled and
-   *          executed if a thread is available.
-   * @param[in] Task, will be pushed to the task queue.
-   * @return    std::future<void>, we could wait for the task finished by
-   *            f.wait().
-   */
+  // Run pushes a function to the task queue and returns a std::future
+  // object.  To wait for the completion of the task, call
+  // std::future::wait().
   template <typename Callback>
   std::future<void> Run(Callback fn) {
+    auto f = this->RunAndGetException(fn);
+    return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
+  }
+
+  template <typename Callback>
+  std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
+      Callback fn) {
     std::unique_lock<std::mutex> lock(mutex_);
-    Task task(std::bind(fn));
-    std::future<void> f = task.get_future();
+    Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
+      try {
+        fn();
+        return nullptr;
+      } catch (platform::EnforceNotMet ex) {
+        return std::unique_ptr<platform::EnforceNotMet>(
+            new platform::EnforceNotMet(ex));
+      } catch (...) {
+        LOG(FATAL)
+            << "Unexpected exception is catched in thread pool. All "
+               "throwable exception in Fluid should be an EnforceNotMet.";
+      }
+    });
+    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
     lock.unlock();
     scheduled_.notify_one();
     return f;
   }
 
-  /**
-   * @brief   Wait until all the tasks are completed.
-   */
-  void Wait() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    completed_.wait(lock, [=] { return Done() == true; });
-  }
+  // Wait until all the tasks are completed.
+  void Wait();
 
  private:
+  struct ExceptionHandler {
+    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+    explicit ExceptionHandler(
+        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+        : future_(std::move(f)) {}
+    void operator()() const {
+      auto ex = this->future_.get();
+      if (ex != nullptr) {
+        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                      "should use RunAndGetException to handle the exception.\n"
+                      "The default exception handler is LOG(FATAL)."
+                   << ex->what();
+      }
+    }
+  };
+
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
-  explicit ThreadPool(int num_threads)
-      : num_threads_(num_threads), available_(num_threads), running_(true) {
-    threads_.resize(num_threads);
-    for (auto& thread : threads_) {
-      // TODO(Yancey1989): binding the thread on the specify CPU number
-      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
-    }
-  }
+  explicit ThreadPool(int num_threads);
 
-  /**
-   * @brief   If the task queue is empty and avaialbe
-   *          is equal to the number of threads, means that
-   *          all tasks are completed.
-   *
-   *          Note: this function is not thread-safe.
-   *
-   * @return true if all tasks are completed.
-   */
-  bool Done() { return tasks_.empty() && available_ == num_threads_; }
-
-  void TaskLoop() {
-    while (running_) {
-      std::unique_lock<std::mutex> lock(mutex_);
-      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
-
-      if (!running_) {
-        break;
-      }
-      // pop a task from the task queue
-      auto task = std::move(tasks_.front());
-      tasks_.pop();
-
-      --available_;
-      lock.unlock();
-
-      // run the task
-      task();
-
-      {
-        std::unique_lock<std::mutex> lock(mutex_);
-        ++available_;
-        if (Done()) {
-          completed_.notify_all();
-        }
-      }
-    }
-  }
+  // If the task queue is empty and avaialbe is equal to the number of
+  // threads, means that all tasks are completed.  Note: this function
+  // is not thread-safe.  Returns true if all tasks are completed.
+  // Note: don't delete the data member total_threads_ and use
+  // threads_.size() instead; because you'd need to lock the mutex
+  // before accessing threads_.
+  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
 
-  static void Init() {
-    if (threadpool.get() == nullptr) {
-      // TODO(Yancey1989): specify the max threads number
-      int num_threads = std::thread::hardware_concurrency();
-      PADDLE_ENFORCE_GT(num_threads, 0);
-      threadpool.reset(new ThreadPool(num_threads));
-    }
-  }
+  // The constructor starts threads to run TaskLoop, which retrieves
+  // and runs tasks from the queue.
+  void TaskLoop();
+
+  // Init is called by GetInstance.
+  static void Init();
 
  private:
-  static std::unique_ptr<ThreadPool> threadpool;
-  static std::once_flag init_flag;
+  static std::unique_ptr<ThreadPool> threadpool_;
+  static std::once_flag init_flag_;
 
-  int num_threads_;
-  int available_;
-  bool running_;
-  std::queue<Task> tasks_;
   std::vector<std::unique_ptr<std::thread>> threads_;
+  const size_t total_threads_;
+  size_t idle_threads_;
+
+  std::queue<Task> tasks_;
   std::mutex mutex_;
+  bool running_;
   std::condition_variable scheduled_;
   std::condition_variable completed_;
 };
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
index 50b6238cd8786be9d8cf2d5f821daadea12bd208..3fbfe7efc867144dbd0dd2613c824c6a3c41b7d8 100644
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
@@ -22,11 +22,7 @@ namespace framework = paddle::framework;
 void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
   std::vector<std::future<void>> fs;
   for (int i = 0; i < cnt; ++i) {
-    auto f = pool->Run([&sum]() { sum.fetch_add(1); });
-    fs.push_back(std::move(f));
-  }
-  for (auto& f : fs) {
-    f.wait();
+    fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
   }
 }
 
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 62ab6593ef23c195e3caa2336574796ecaf35bc8..11a4daf2c991fc85a65c242403a0c83d06c4c44c 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -26,18 +26,98 @@ void VarDesc::SetShape(const std::vector<int64_t> &dims) {
   VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }
 
+void VarDesc::SetTensorDescNum(size_t num) {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+      lod_tensors_ptr->Clear();
+      for (size_t i = 0; i < num; ++i) {
+        lod_tensors_ptr->Add();
+      }
+      return;
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      return desc_.reader().lod_tensor_size();
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetShapes(
+    const std::vector<std::vector<int64_t>> &multiple_dims) {
+  if (multiple_dims.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_dims.size());
+  }
+  std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_dims.size(); ++i) {
+    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+  }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<std::vector<int64_t>> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(RepeatedToVector(tensor_desc.dims()));
+  }
+  return res;
+}
+
 void VarDesc::SetDataType(proto::DataType data_type) {
   mutable_tensor_desc()->set_data_type(data_type);
 }
 
-std::vector<int64_t> VarDesc::Shape() const {
-  return RepeatedToVector(tensor_desc().dims());
+void VarDesc::SetDataTypes(
+    const std::vector<proto::DataType> &multiple_data_type) {
+  if (multiple_data_type.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given data types("
+            << multiple_data_type.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_data_type.size());
+  }
+  std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+    tensor_descs[i]->set_data_type(multiple_data_type[i]);
+  }
 }
 
 proto::DataType VarDesc::GetDataType() const {
   return tensor_desc().data_type();
 }
 
+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<proto::DataType> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(tensor_desc.data_type());
+  }
+  return res;
+}
+
 void VarDesc::SetLoDLevel(int32_t lod_level) {
   switch (desc_.type()) {
     case proto::VarDesc::LOD_TENSOR:
@@ -47,8 +127,32 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Setting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+  if (multiple_lod_level.size() != GetTensorDescNum()) {
+    VLOG(3) << "WARNING: The number of given lod_levels("
+            << multiple_lod_level.size()
+            << ") doesn't match the existing tensor number("
+            << GetTensorDescNum()
+            << "). The Reader is going to be reinitialized.";
+    SetTensorDescNum(multiple_lod_level.size());
+  }
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      size_t i = 0;
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        lod_tensor.set_lod_level(multiple_lod_level[i++]);
+      }
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
@@ -59,13 +163,31 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Getting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+  std::vector<int32_t> res;
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      res.reserve(desc_.reader().lod_tensor_size());
+      for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.lod_level());
+      }
+      return res;
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
 const proto::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.selected_rows();
@@ -74,13 +196,32 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+      PADDLE_THROW(
+          "Getting 'tensor_desc' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
 
 proto::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(),
-                 "invoke MutableTensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.mutable_selected_rows();
@@ -89,8 +230,30 @@ proto::TensorDesc *VarDesc::mutable_tensor_desc() {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.mutable_tensor_array()->mutable_tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_desc' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc *> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        res.push_back(lod_tensor.mutable_tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 9316b14bb695c185efd6db4296d422ef0c476d57..72da2fbb0a66ec7ca8c0c274dc4273bfbfcf303e 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -68,18 +68,34 @@ class VarDesc {
 
   void SetName(std::string name) { desc_.set_name(name); }
 
+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
   void SetShape(const std::vector<int64_t> &dims);
 
+  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
+
+  std::vector<int64_t> GetShape() const;
+
+  std::vector<std::vector<int64_t>> GetShapes() const;
+
   void SetDataType(proto::DataType data_type);
 
-  std::vector<int64_t> Shape() const;
+  void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);
 
   proto::DataType GetDataType() const;
 
+  std::vector<proto::DataType> GetDataTypes() const;
+
   void SetLoDLevel(int32_t lod_level);
 
+  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
   int32_t GetLoDLevel() const;
 
+  std::vector<int32_t> GetLoDLevels() const;
+
   proto::VarDesc::VarType GetType() const;
 
   void SetType(proto::VarDesc::VarType type);
@@ -90,7 +106,9 @@ class VarDesc {
 
  private:
   const proto::TensorDesc &tensor_desc() const;
+  std::vector<proto::TensorDesc> tensor_descs() const;
   proto::TensorDesc *mutable_tensor_desc();
+  std::vector<proto::TensorDesc *> mutable_tensor_descs();
 
   proto::VarDesc desc_;
 };
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index 5b7a08a08732a6ccbc206f6a4f0aa4788ce4a219..599d45149024ca0fb395c2a1c6deeb7d8cd5eb17 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/reader.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/framework/variable.h"
 
@@ -31,6 +32,8 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
     return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
   } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
     return proto::VarDesc_VarType_SELECTED_ROWS;
+  } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
+    return proto::VarDesc_VarType_READER;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
@@ -40,7 +43,7 @@ template <typename Visitor>
 inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
   switch (ToVarType(var.Type())) {
     case proto::VarDesc_VarType_LOD_TENSOR:
-      visitor(var.Get<framework::LoDTensor>());
+      visitor(var.Get<LoDTensor>());
       return;
     case proto::VarDesc_VarType_LOD_RANK_TABLE:
       visitor(var.Get<LoDRankTable>());
@@ -51,6 +54,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarDesc_VarType_SELECTED_ROWS:
       visitor(var.Get<SelectedRows>());
       return;
+    case proto::VarDesc_VarType_READER:
+      visitor(var.Get<ReaderHolder>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
   }
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index cbdbf5335d32d55a0221728758025c9d2cb3e7d1..a9876cec2aabf7d116443b685391ee9d20bc1370 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
     real* inputData = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
     bool needIm2col = isNeedIm2col(filter);
 
     TensorShape imShape =
         TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
     TensorShape colShape;
-    real* colData = NULL;
 
-    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
-    size_t colWidth = outputHeight * outputWidth;
-    // Max col matrix height 256, Max col matrix width 1024
-    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
-    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+    // Max col matrix width 4096, Max col matrix size 4M.
+    size_t outputHeightSteps =
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+    size_t maxColWidth = outputHeightSteps * outputWidth;
+    size_t channelSteps =
+        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
+                          (size_t)1),
+                 inputChannels / groups_);
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
 
     if (needIm2col) {
       colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
                               outputHeight,
                               outputWidth});
 
-      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
       colData = reinterpret_cast<real*>(memory_->getBuf());
     }
 
@@ -209,20 +212,24 @@ public:
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
 
-    int nStride = colWidth;
-    int kStride = colHeight;
+    int nStride = outputHeight * outputWidth;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
     for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
       for (size_t g = 0; g < groups_; g++) {
         if (needIm2col) {
           real beta_ = beta;
-          for (size_t colHeightStart = 0; colHeightStart < colHeight;
-               colHeightStart += stepColHeight) {
-            for (size_t colWidthStart = 0; colWidthStart < colWidth;
-                 colWidthStart += stepColWidth) {
-              int N = std::min(colWidth - colWidthStart, stepColWidth);
-              int K = std::min(colHeight - colHeightStart, stepColHeight);
+          for (size_t ic = 0; ic < inputChannels / groups_;
+               ic += channelSteps) {
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+              int height = std::min(outputHeight - oh, outputHeightSteps);
+
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
               // im2col
-              im2col(inputData + g * inputOffset,
+              im2col(inputData,
                      imShape,
                      colData,
                      colShape,
@@ -232,13 +239,12 @@ public:
                      paddingW(),
                      dilationH(),
                      dilationW(),
-                     colHeightStart,
-                     K,
-                     colWidthStart,
+                     channels,
+                     oh,
+                     height,
                      N);
 
               // gemm
-              int M = outputChannels / groups_;
               BlasGemm<Device, real>::compute(
                   false,
                   false,
@@ -246,12 +252,12 @@ public:
                   N,
                   K,
                   1.0f,
-                  filterData + g * filterOffset + colHeightStart,
+                  filterData + ic * filterHeight * filterWidth,
                   kStride,
                   colData,
                   N,
                   beta_,
-                  outputData + g * outputOffset + colWidthStart,
+                  outputData + oh * outputWidth,
                   nStride);
             }
             beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
                                           N,
                                           K,
                                           1.0f,
-                                          filterData + g * filterOffset,
+                                          filterData,
                                           K,
-                                          inputData + g * inputOffset,
+                                          inputData,
                                           N,
                                           beta,
-                                          outputData + g * outputOffset,
+                                          outputData,
                                           N);
         }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
       }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
     }
 
     memory_.reset();
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 36a9bcf84e4b14965c83627821b71d1c7c0da1b2..915119e291caaa223249cf8e37078723621517b0 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -111,39 +111,42 @@ public:
                   int paddingWidth,
                   int dilationHeight,
                   int dilationWidth,
-                  int colHeightStart,
-                  int colHeightSize,
-                  int colWidthStart,
-                  int colWidthSize) {
+                  int inputChannels,
+                  int colOffset,
+                  int colOutputHeight,
+                  int colWidth) {
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
     int filterHeight = colShape[1];
     int filterWidth = colShape[2];
     int outputWidth = colShape[4];
 
-    for (int colh = 0; colh < colHeightSize; colh++) {
-      int wOffset = (colHeightStart + colh) % filterWidth;
-      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
-      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
-
-      for (int colw = 0; colw < colWidthSize; colw++) {
-        int h = (colWidthStart + colw) / outputWidth;
-        int w = (colWidthStart + colw) % outputWidth;
-
-        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-        int imColIdx = w * strideWidth + wOffset * dilationWidth;
-        if ((imRowIdx - paddingHeight) < 0 ||
-            (imRowIdx - paddingHeight) >= inputHeight ||
-            (imColIdx - paddingWidth) < 0 ||
-            (imColIdx - paddingWidth) >= inputWidth) {
-          colData[colh * colWidthSize + colw] = static_cast<T>(0);
-        } else {
-          imRowIdx += c_im * inputHeight - paddingHeight;
-          imColIdx -= paddingWidth;
-          colData[colh * colWidthSize + colw] =
-              imData[imRowIdx * inputWidth + imColIdx];
+    for (int ic = 0; ic < inputChannels; ic++) {
+      for (int oh = 0; oh < colOutputHeight; oh++) {
+        T* dstData = colData + oh * outputWidth;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight +
+                           fh * dilationHeight - paddingHeight;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
+            } else {
+              for (int ow = 0; ow < outputWidth; ow++) {
+                int imColIdx =
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
+                  dstData[ow] = T(0);
+                } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+                }
+              }
+            }
+            dstData += colWidth;
+          }
         }
       }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
     }
   }
 };
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 3ba866dcdd845403d52f7a85adfef08cbb11c305..fe44a8bf79005efb87c56f6a79f46421129bab22 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
                           padding,
                           dilation,
                           dilation,
+                          channels,
                           0,
-                          height,
-                          0,
-                          width);
+                          outputHeight,
+                          outputHeight * outputWidth);
 
                   autotest::TensorCheckEqual(*output1, *output2);
                 }
diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
index c6e07650fc4805a25baf38b9059f6c996d00cafc..2495d8b60a56713ba554156d2d9b25e4f6a567d7 100644
--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -212,6 +212,10 @@ TEST(compareSparse, NeuralNetwork) {
 }
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Turn off this test due CI failure:
+  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
+  return 0;
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
   initPython(argc, argv);
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index ae4d3fd2f58daf87a650428e04722581610ed780..bdb147955ca0700dc0854b54c38d961caf8845f3 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -1,42 +1,18 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
 
 cc_library(paddle_fluid_api
-    SRCS inference.cc
+    SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
-# Merge all modules into a single static library
+# Create static library
 cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 # Create shared library
-add_library(paddle_fluid_shared SHARED inference.cc)
+cc_library(paddle_fluid_shared SHARED
+    SRCS io.cc
+    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 
-target_circle_link_libraries(paddle_fluid_shared
-  ARCHIVE_START
-  ${GLOB_OP_LIB}
-  ARCHIVE_END
-  ${FLUID_CORE_MODULES})
-
-SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
-
-# install library & headers
-if(NOT WITH_C_API AND WITH_FLUID)
-  install(FILES inference.h DESTINATION include/paddle/inference)
-  install(TARGETS paddle_fluid_shared DESTINATION lib)
-endif()
-
-add_executable(example example.cc)
-if(APPLE)
-  set(OPTIONAL_LINK_FLAGS)
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-    set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
-  endif()
-  target_link_libraries(example
-      -Wl,-force_load paddle_fluid
-      ${OPTIONAL_LINK_FLAGS}
-      ${PTOOLS_LIB})
-else()
-  target_link_libraries(example
-      -Wl,--start-group -Wl,--whole-archive paddle_fluid
-      -Wl,--no-whole-archive -Wl,--end-group
-      ${PTOOLS_LIB})
+if(WITH_TESTING)
+  add_subdirectory(tests/book)
 endif()
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
deleted file mode 100644
index 0c18b45624dedcb5839d4b771e044b4a7b32af52..0000000000000000000000000000000000000000
--- a/paddle/inference/example.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <iostream>
-#include "gflags/gflags.h"
-#include "paddle/inference/inference.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_dirname.empty()) {
-    // Example:
-    //   ./example --dirname=recognize_digits_mlp.inference.model
-    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
-    exit(1);
-  }
-
-  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  paddle::InferenceEngine* engine = new paddle::InferenceEngine();
-  engine->LoadInferenceModel(dirname);
-
-  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
-
-  std::vector<paddle::framework::LoDTensor> feeds;
-  feeds.push_back(input);
-  std::vector<paddle::framework::LoDTensor> fetchs;
-  engine->Execute(feeds, fetchs);
-
-  for (size_t i = 0; i < fetchs.size(); ++i) {
-    auto dims_i = fetchs[i].dims();
-    std::cout << "dims_i:";
-    for (int j = 0; j < dims_i.size(); ++j) {
-      std::cout << " " << dims_i[j];
-    }
-    std::cout << std::endl;
-    std::cout << "result:";
-    float* output_ptr = fetchs[i].data<float>();
-    for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
-      std::cout << " " << output_ptr[j];
-    }
-    std::cout << std::endl;
-  }
-
-  delete engine;
-  return 0;
-}
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
deleted file mode 100644
index 49001778808173b82865a4b6632a6b175ef96242..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.cc
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "inference.h"
-#include <fstream>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/feed_fetch_method.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/scope.h"
-
-#ifdef PADDLE_USE_PTOOLS
-#include "chooseser.h"
-#endif
-
-namespace paddle {
-
-void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
-  std::string model_filename = dirname + "/__model__.dat";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
-  std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
-
-  program_ = new framework::ProgramDesc(program_desc_str);
-  GenerateLoadProgram(dirname);
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-  feed_var_names_.clear();
-  fetch_var_names_.clear();
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == "feed") {
-      feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
-    } else if (op->Type() == "fetch") {
-      fetch_var_names_.push_back(op->Input("X")[0]);
-    }
-  }
-}
-
-void InferenceEngine::LoadInferenceModel(
-    const std::string& dirname,
-    const std::vector<std::string>& feed_var_names,
-    const std::vector<std::string>& fetch_var_names) {
-  std::string model_filename = dirname + "/__model__.dat";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
-  std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
-
-  program_ = new framework::ProgramDesc(program_desc_str);
-  GenerateLoadProgram(dirname);
-
-  if (feed_var_names.empty() || fetch_var_names.empty()) {
-    LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
-  }
-  feed_var_names_ = feed_var_names;
-  fetch_var_names_ = fetch_var_names;
-  PrependFeedOp();
-  AppendFetchOp();
-}
-
-bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
-  if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
-    // There are many unreachable variables in the program
-    for (size_t i = 0; i < program_->Size(); ++i) {
-      const framework::BlockDesc& block = program_->Block(i);
-      for (auto* op : block.AllOps()) {
-        for (auto input_argument_name : op->InputArgumentNames()) {
-          if (input_argument_name == var->Name()) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
-void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  load_program_ = new framework::ProgramDesc();
-  framework::BlockDesc* load_block = load_program_->MutableBlock(0);
-  for (auto* var : global_block->AllVars()) {
-    if (IsParameter(var)) {
-      LOG(INFO) << "parameter's name: " << var->Name();
-
-      framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
-      new_var->SetDataType(var->GetDataType());
-      new_var->SetType(var->GetType());
-      new_var->SetLoDLevel(var->GetLoDLevel());
-      new_var->SetPersistable(true);
-
-      // append_op
-      framework::OpDesc* op = load_block->AppendOp();
-      op->SetType("load");
-      op->SetOutput("Out", {new_var->Name()});
-      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
-      op->CheckAttrs();
-    }
-  }
-}
-
-void InferenceEngine::PrependFeedOp() {
-  if (!program_) {
-    LOG(FATAL) << "Please initialize the program_ first.";
-  }
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  // create_var
-  framework::VarDesc* feed_var = global_block->Var("feed");
-  feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
-  feed_var->SetPersistable(true);
-
-  // prepend feed_op
-  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
-    std::string var_name = feed_var_names_[i];
-    LOG(INFO) << "feed var's name: " << var_name;
-
-    // prepend_op
-    framework::OpDesc* op = global_block->PrependOp();
-    op->SetType("feed");
-    op->SetInput("X", {"feed"});
-    op->SetOutput("Out", {var_name});
-    op->SetAttr("col", {static_cast<int>(i)});
-    op->CheckAttrs();
-  }
-}
-
-void InferenceEngine::AppendFetchOp() {
-  if (!program_) {
-    LOG(FATAL) << "Please initialize the program_ first.";
-  }
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  // create_var
-  framework::VarDesc* fetch_var = global_block->Var("fetch");
-  fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
-  fetch_var->SetPersistable(true);
-
-  // append fetch_op
-  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-    std::string var_name = fetch_var_names_[i];
-    LOG(INFO) << "fetch var's name: " << var_name;
-
-    // append_op
-    framework::OpDesc* op = global_block->AppendOp();
-    op->SetType("fetch");
-    op->SetInput("X", {var_name});
-    op->SetOutput("Out", {"fetch"});
-    op->SetAttr("col", {static_cast<int>(i)});
-    op->CheckAttrs();
-  }
-}
-
-void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
-                              std::vector<framework::LoDTensor>& fetchs) {
-  if (!program_ || !load_program_) {
-    LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
-  }
-
-  if (feeds.size() < feed_var_names_.size()) {
-    LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
-  }
-
-  auto* place = new platform::CPUPlace();
-  framework::InitDevices();
-  framework::Executor* executor = new framework::Executor(*place);
-  framework::Scope* scope = new framework::Scope();
-
-  executor->Run(*load_program_, scope, 0, true, true);
-
-  // set_feed_variable
-  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
-    framework::SetFeedVariable(scope, feeds[i], "feed", i);
-  }
-
-  executor->Run(*program_, scope, 0, true, true);
-
-  // get_fetch_variable
-  fetchs.resize(fetch_var_names_.size());
-  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-    fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
-  }
-
-  delete place;
-  delete scope;
-  delete executor;
-}
-}  // namespace paddle
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
deleted file mode 100644
index 7fc09cb9e539a65a8cd3cceb1543bc7d111c22b3..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/program_desc.h"
-
-namespace paddle {
-
-class InferenceEngine {
-public:
-  InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
-  ~InferenceEngine() {
-    delete program_;
-    delete load_program_;
-  }
-
-  void LoadInferenceModel(const std::string& dirname);
-  void LoadInferenceModel(const std::string& dirname,
-                          const std::vector<std::string>& feed_var_names,
-                          const std::vector<std::string>& fetch_var_names);
-  void Execute(const std::vector<framework::LoDTensor>& feeds,
-               std::vector<framework::LoDTensor>& fetchs);
-
-private:
-  bool IsParameter(const framework::VarDesc* var);
-  void GenerateLoadProgram(const std::string& dirname);
-  void PrependFeedOp();
-  void AppendFetchOp();
-
-private:
-  framework::ProgramDesc* program_;
-  framework::ProgramDesc* load_program_;
-  std::vector<std::string> feed_var_names_;
-  std::vector<std::string> fetch_var_names_;
-};
-
-}  // namespace paddle
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..784e87970f77857e7f3182df904dc0133c44d6c9
--- /dev/null
+++ b/paddle/inference/io.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/inference/io.h"
+
+#include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace inference {
+
+void ReadBinaryFile(const std::string& filename, std::string& contents) {
+  VLOG(3) << "loading model from " << filename;
+  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
+  inputfs.seekg(0, std::ios::end);
+  contents.clear();
+  contents.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  inputfs.read(&contents[0], contents.size());
+  inputfs.close();
+}
+
+bool IsParameter(const framework::VarDesc* var,
+                 const framework::ProgramDesc& main_program) {
+  if (var->Persistable()) {
+    // There are many unreachable variables in the program
+    for (size_t i = 0; i < main_program.Size(); ++i) {
+      const framework::BlockDesc& block = main_program.Block(i);
+      for (auto* op : block.AllOps()) {
+        if (op->Type() == framework::kFeedOpType) {
+          continue;
+        }
+        for (auto input_argument_name : op->InputArgumentNames()) {
+          if (input_argument_name == var->Name()) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
+                      const std::string& dirname,
+                      const std::string& param_filename) {
+  const framework::BlockDesc& global_block = main_program.Block(0);
+
+  framework::ProgramDesc* load_program = new framework::ProgramDesc();
+  framework::BlockDesc* load_block = load_program->MutableBlock(0);
+  std::vector<std::string> paramlist;
+
+  for (auto* var : global_block.AllVars()) {
+    if (IsParameter(var, main_program)) {
+      VLOG(3) << "parameter's name: " << var->Name();
+
+      framework::VarDesc* new_var = load_block->Var(var->Name());
+      new_var->SetShape(var->GetShape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      if (!param_filename.empty()) {
+        paramlist.push_back(new_var->Name());
+      } else {
+        // append_op
+        framework::OpDesc* op = load_block->AppendOp();
+        op->SetType("load");
+        op->SetOutput("Out", {new_var->Name()});
+        op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+        op->CheckAttrs();
+      }
+    }
+  }
+
+  if (!param_filename.empty()) {
+    // sort paramlist to have consistent ordering
+    std::sort(paramlist.begin(), paramlist.end());
+    // append just the load_combine op
+    framework::OpDesc* op = load_block->AppendOp();
+    op->SetType("load_combine");
+    op->SetOutput("Out", paramlist);
+    op->SetAttr("file_path", {param_filename});
+    op->CheckAttrs();
+  }
+
+  executor.Run(*load_program, &scope, 0, true, true);
+
+  VLOG(3) << "Ran loading successfully";
+  delete load_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname) {
+  std::string model_filename = dirname + "/__model__";
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, program_desc_str);
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, *main_program, dirname, "");
+  return main_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(
+    framework::Executor& executor,
+    framework::Scope& scope,
+    const std::string& prog_filename,
+    const std::string& param_filename) {
+  std::string model_filename = prog_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, program_desc_str);
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, *main_program, "", param_filename);
+  return main_program;
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/io.h b/paddle/inference/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7d7c499690620740d8627e7f5085d728d67f7c3
--- /dev/null
+++ b/paddle/inference/io.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace inference {
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const framework::ProgramDesc& main_program,
+                      const std::string& dirname,
+                      const std::string& param_filename);
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname);
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& prog_filename,
+                                             const std::string& param_filename);
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5c866eb1e2ee2c4dc4e39c134b9fabd445082c89
--- /dev/null
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,31 @@
+function(inference_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs ARGS)
+  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+  set(arg_list "")
+  if(inference_test_ARGS)
+    foreach(arg ${inference_test_ARGS})
+      list(APPEND arg_list "_${arg}")
+    endforeach()
+  else()
+    list(APPEND arg_list "_")
+  endif()
+  foreach(arg ${arg_list})
+    string(REGEX REPLACE "^_$" "" arg "${arg}")
+    cc_test(test_inference_${TARGET_NAME}${arg}
+        SRCS test_inference_${TARGET_NAME}.cc
+        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
+    set_tests_properties(test_inference_${TARGET_NAME}${arg}
+        PROPERTIES DEPENDS test_${TARGET_NAME})
+  endforeach()
+endfunction(inference_test)
+
+inference_test(recognize_digits ARGS mlp)
+inference_test(image_classification ARGS vgg resnet)
+inference_test(label_semantic_roles)
+inference_test(rnn_encoder_decoder)
+inference_test(recommender_system)
diff --git a/paddle/inference/tests/book/test_helper.h b/paddle/inference/tests/book/test_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..22ce903c7250b84fd0b08e82cfda03df411a3068
--- /dev/null
+++ b/paddle/inference/tests/book/test_helper.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <time.h>
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 T lower,
+                 T upper) {
+  srand(time(0));
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int i = 0; i < input.numel(); ++i) {
+    input_ptr[i] =
+        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
+        lower;
+  }
+}
+
+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 std::vector<T>& data) {
+  CHECK_EQ(paddle::framework::product(dims), data.size());
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+}
+
+template <typename T>
+void SetupLoDTensor(paddle::framework::LoDTensor& input,
+                    paddle::framework::LoD& lod,
+                    T lower,
+                    T upper) {
+  input.set_lod(lod);
+  int dim = lod[0][lod[0].size() - 1];
+  SetupTensor<T>(input, {dim, 1}, lower, upper);
+}
+
+template <typename T>
+void SetupLoDTensor(paddle::framework::LoDTensor& input,
+                    paddle::framework::DDim dims,
+                    paddle::framework::LoD lod,
+                    std::vector<T>& data) {
+  const size_t level = lod.size() - 1;
+  CHECK_EQ(dims[0], (lod[level]).back());
+  input.set_lod(lod);
+  SetupTensor<T>(input, dims, data);
+}
+
+template <typename T>
+void CheckError(paddle::framework::LoDTensor& output1,
+                paddle::framework::LoDTensor& output2) {
+  // Check lod information
+  EXPECT_EQ(output1.lod(), output2.lod());
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  T err = static_cast<T>(0);
+  if (typeid(T) == typeid(float)) {
+    err = 1E-3;
+  } else if (typeid(T) == typeid(double)) {
+    err = 1E-6;
+  } else {
+    err = 0;
+  }
+
+  size_t count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
+template <typename Place, bool IsCombined = false>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 1. Define place, executor, scope and inference_program
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+  if (IsCombined) {
+    // All parameters are saved in a single file.
+    // Hard-coding the file names of program and parameters in unittest.
+    // Users are free to specify different filename.
+    std::string prog_filename = "__model_combined__";
+    std::string param_filename = "__params_combined__";
+    inference_program = paddle::inference::Load(executor,
+                                                *scope,
+                                                dirname + "/" + prog_filename,
+                                                dirname + "/" + param_filename);
+  } else {
+    // Parameters are saved in separate files sited in the specified `dirname`.
+    inference_program = paddle::inference::Load(executor, *scope, dirname);
+  }
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+  delete scope;
+}
diff --git a/paddle/inference/tests/book/test_inference_image_classification.cc b/paddle/inference/tests/book/test_inference_image_classification.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36ea7c77a75fc0540922eb0f9eb3899733a0afa2
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_image_classification.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, image_classification) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 3, 32, 32},
+                     static_cast<float>(0),
+                     static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
new file mode 100644
index 0000000000000000000000000000000000000000..922dbfd3338433a58632592667307e4da4dac9da
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, label_semantic_roles) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
+      ctx_p2, mark;
+  paddle::framework::LoD lod{{0, 4, 10}};
+
+  SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(
+      predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&word);
+  cpu_feeds.push_back(&predicate);
+  cpu_feeds.push_back(&ctx_n2);
+  cpu_feeds.push_back(&ctx_n1);
+  cpu_feeds.push_back(&ctx_0);
+  cpu_feeds.push_back(&ctx_p1);
+  cpu_feeds.push_back(&ctx_p2);
+  cpu_feeds.push_back(&mark);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af8c2b14c3b1651a3714de10422a1b5dd8e1519f
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, recognize_digits) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(input,
+                     {batch_size, 1, 28, 28},
+                     static_cast<float>(-1),
+                     static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
+
+TEST(inference, recognize_digits_combine) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, true>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, true>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/inference/tests/book/test_inference_recommender_system.cc b/paddle/inference/tests/book/test_inference_recommender_system.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec24c7e6ab7d1573b02bed294f43053ee53e4e57
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recommender_system.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, recommender_system) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int64_t batch_size = 1;
+
+  paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id,
+      category_id, movie_title;
+
+  // Use the first data from paddle.dataset.movielens.test() as input
+  std::vector<int64_t> user_id_data = {1};
+  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+
+  std::vector<int64_t> gender_id_data = {1};
+  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+
+  std::vector<int64_t> age_id_data = {0};
+  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+
+  std::vector<int64_t> job_id_data = {10};
+  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+
+  std::vector<int64_t> movie_id_data = {783};
+  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+
+  std::vector<int64_t> category_id_data = {10, 8, 9};
+  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+
+  std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
+  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&user_id);
+  cpu_feeds.push_back(&gender_id);
+  cpu_feeds.push_back(&age_id);
+  cpu_feeds.push_back(&job_id);
+  cpu_feeds.push_back(&movie_id);
+  cpu_feeds.push_back(&category_id);
+  cpu_feeds.push_back(&movie_title);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..248b9dce217232f1b88d74af9df31648f7779f98
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, rnn_encoder_decoder) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor word_data, trg_word;
+  paddle::framework::LoD lod{{0, 4, 10}};
+
+  SetupLoDTensor(
+      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(
+      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&word_data);
+  cpu_feeds.push_back(&trg_word);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.lod();
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.lod();
+  LOG(INFO) << output2.dims();
+
+  CheckError<float>(output1, output2);
+#endif
+}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 1ec4336cabbc7d3073b7638b7484bf61e83a2dc5..cc86b12be08ba987f9682ebf3fda56c2f07fb576 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
     CHECK_EQ(channels * outLength, maskMatP->getWidth());
   }
 
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[i * outStride + j] = -(real)FLT_MAX;
-    }
-  }
-
   /* pool max one by one */
   for (size_t n = 0; n < num; ++n) {  // frame by frame
     if (!isContiguous()) {
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
     for (size_t c = 0; c < channels; ++c) {  // channel by channel
       for (size_t ph = 0; ph < outputH; ++ph) {
         int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
+        int hend = hstart + sizeY;
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
         for (size_t pw = 0; pw < outputW; ++pw) {
           int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
+          int wend = wstart + sizeX;
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
           if (maskData == NULL) {
+            real tmp = -(real)FLT_MAX;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
-                outData[ph * outputW + pw] = std::max(
-                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+                tmp = tmp < inputData[h * imgSizeW + w]
+                          ? inputData[h * imgSizeW + w]
+                          : tmp;
               }
             }
+            outData[ph * outputW + pw] = tmp;
           } else {
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 496098f80423854be62dc99b8601209ff6a6b182..1a61c484823b292234d4758cdc1959d7a21510e6 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -14,10 +14,3 @@ cc_library(paddle_memory
     system_allocator)
 
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB MEMORY_HEADERS *.h)
-  file(GLOB MEMORY_DETAIL_HEADERS detail/*.h)
-  install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory)
-  install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail)
-endif()
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 7012b6d331d0c4631a3d120fbaf3db7c97298ac7..30ed68c6e0ea95d206658d16684800e169ededc5 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -81,5 +81,23 @@ class PODDeleter {
   Place place_;
 };
 
+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 15f7cb6b560590f55e276fde4900d2e3c0045fb8..25bb7187d36c5f696890ef72d4cb91bce94fddf8 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -62,7 +62,7 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
     set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
 endif()
 
 op_library(cond_op DEPS framework_proto tensor net_op)
@@ -147,15 +149,20 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(create_reader_op DEPS reader)
 
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+    vol2col depthwise_conv)
+
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
@@ -172,12 +179,14 @@ endif()
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
@@ -191,3 +200,4 @@ if(WITH_GPU)
     cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 88c3d1c597a853abdee7753a5110be4a1726e905..c0809abc05104c1e8c1f42331c0530724dd1472f 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -323,7 +323,7 @@ template <typename T>
 struct FloorFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.ceil();
+    out.device(d) = x.floor();
   }
 };
 
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 4e579387924a5b0499f29609bc6b1322030a3c0d..00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
     auto grad_merge = merge_func(context, grad);
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    auto& merge_rows = grad_merge.rows();
+    framework::Vector<int64_t> merge_rows(grad_merge.rows());
     // 2. m += g_m * g_m
     math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
     auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
-                                   lr, param_data, moment_data, grad_width,
+                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
+                                   param_data, moment_data, grad_width,
                                    epsilon);
   }
 };
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 9cc34bdded780e61e8700eb4fa4a295c84fb48bc..bf536687d398b8342e6ae76a07c11e5fe47483e0 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
           merge_func(ctx.template device_context<DeviceContext>(), grad);
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
-      auto* rows = grad_merge.rows().data();
+      int64_t* rows = nullptr;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = grad_merge.mutable_rows()->cuda_data();
+      } else {
+        rows = grad_merge.mutable_rows()->data();
+      }
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc
index 83c8778fe4cec4d9d80de691e117a39fdd92f494..1e6fa2091de25218e2bdafeb740ce884234638a5 100644
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("DistMat"),
                    "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
 
     auto dims = ctx->GetInputDim("DistMat");
     PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
 
     ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDis", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
   }
 };
 
@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dist_mat = context.Input<LoDTensor>("DistMat");
     auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
 
     auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
 
@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
               "Otherwise, it means B[j] is matched to row "
               "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
               "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDis",
+    AddOutput("ColToRowMatchDist",
               "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
               "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
               "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
               "instance are called LoD. Then "
-              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
     AddComment(R"DOC(
 This operator is a greedy bipartite matching algorithm, which is used to
 obtain the matching with the maximum distance based on the input
diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..539813d4858b8faef386047f9ef64aa232aefca1
--- /dev/null
+++ b/paddle/operators/box_coder_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
+
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBoxVar must be 2");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                      "The rank of Input of TargetBox must be 2");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                      "The shape of TargetBox is [M, 4]");
+
+    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+  }
+};
+
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>) "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+             "of variance.");
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the box if the input "
+        "is image feature map, they are close to the origin of the coordinate "
+        "system. [xmax, ymax] is the right bottom coordinate of the box. "
+        "This tensor can contain LoD information to represent a batch "
+        "of inputs. One instance of this batch can contain different "
+        "numbers of entities.");
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddOutput(
+        "OutputBox",
+        "(LoDTensor or Tensor) "
+        "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
+        "representing the result of N target boxes encoded/decoded with "
+        "M Prior boxes and variances.");
+
+    AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv 
+oh = log(abs(th / ph)) / phv 
+The Decoding schema described below:
+ox = (pw * pxv * tx * + px) - tw / 2
+oy = (ph * pyv * ty * + py) - th / 2
+ow = exp(pwv * tw) * pw + tw / 2
+oh = exp(phv * th) * ph + th / 2
+where tx, ty, tw, th denote the target box's center coordinates, width and
+height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
+center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
+of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
+width and height.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..98bd93457fafb49f2af5e1ff258fbfa9f9985600
--- /dev/null
+++ b/paddle/operators/box_coder_op.cu
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {
+    const int row_idx = idx / col;
+    const int col_idx = idx % col;
+    T prior_box_width =
+        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
+    T prior_box_height =
+        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_center_x =
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
+
+    T target_box_center_x =
+        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
+        2;
+    T target_box_center_y = (target_box_data[row_idx * len + 3] +
+                             target_box_data[row_idx * len + 1]) /
+                            2;
+    T target_box_width =
+        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
+    T target_box_height =
+        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
+
+    output[idx * len] = (target_box_center_x - prior_box_center_x) /
+                        prior_box_width / prior_box_var_data[col_idx * len];
+    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
+                            prior_box_height /
+                            prior_box_var_data[col_idx * len + 1];
+    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
+                            prior_box_var_data[col_idx * len + 2];
+    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
+                            prior_box_var_data[col_idx * len + 3];
+  }
+}
+
+template <typename T>
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {
+    const int row_idx = idx / col;
+    const int col_idx = idx % col;
+    T prior_box_width =
+        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
+    T prior_box_height =
+        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_center_x =
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
+
+    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+                             target_box_data[row_idx * len + 2]) *
+                         prior_box_width;
+    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+                              target_box_data[row_idx * len + 3]) *
+                          prior_box_height;
+    T target_box_center_x = prior_box_var_data[col_idx * len] *
+                                target_box_data[row_idx * len] *
+                                prior_box_width +
+                            prior_box_center_x;
+    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+                                target_box_data[row_idx * len + 1] *
+                                prior_box_height +
+                            prior_box_center_y;
+
+    output[idx * len] = target_box_center_x - target_box_width / 2;
+    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
+    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
+    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
+  }
+}
+
+template <typename T>
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    int block = 512;
+    int grid = (row * col + block - 1) / block;
+    auto& device_ctx = context.cuda_device_context();
+
+    const T* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = prior_box_var->data<T>();
+    const T* target_box_data = target_box->data<T>();
+
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    T* output = output_box->data<T>();
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
+                        ops::BoxCoderCUDAKernel<double>);
diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..086251f6e066f082743f332ce72918c6e572ce19
--- /dev/null
+++ b/paddle/operators/box_coder_op.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
+
+inline BoxCodeType GetBoxCodeType(const std::string& type) {
+  if (type == "encode_center_size") {
+    return BoxCodeType::kEncodeCenterSize;
+  } else if (type == "decode_center_size") {
+    return BoxCodeType::kDecodeCenterSize;
+  }
+  PADDLE_THROW("Not support type %s.", type);
+}
+
+template <typename T>
+class BoxCoderKernel : public framework::OpKernel<T> {
+ public:
+  void EncodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    int64_t row = target_box.dims()[0];
+    int64_t col = prior_box.dims()[0];
+    int64_t len = prior_box.dims()[1];
+    auto* target_box_data = target_box.data<T>();
+    auto* prior_box_data = prior_box.data<T>();
+    auto* prior_box_var_data = prior_box_var.data<T>();
+
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        T prior_box_width =
+            prior_box_data[j * len + 2] - prior_box_data[j * len];
+        T prior_box_height =
+            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_center_x =
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+
+        T target_box_center_x =
+            (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
+        T target_box_center_y =
+            (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
+        T target_box_width =
+            target_box_data[i * len + 2] - target_box_data[i * len];
+        T target_box_height =
+            target_box_data[i * len + 3] - target_box_data[i * len + 1];
+
+        size_t offset = i * col * len + j * len;
+        output[offset] = (target_box_center_x - prior_box_center_x) /
+                         prior_box_width / prior_box_var_data[j * len];
+        output[offset + 1] = (target_box_center_y - prior_box_center_y) /
+                             prior_box_height / prior_box_var_data[j * len + 1];
+        output[offset + 2] =
+            std::log(std::fabs(target_box_width / prior_box_width)) /
+            prior_box_var_data[j * len + 2];
+        output[offset + 3] =
+            std::log(std::fabs(target_box_height / prior_box_height)) /
+            prior_box_var_data[j * len + 3];
+      }
+    }
+  }
+  void DecodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    int64_t row = target_box.dims()[0];
+    int64_t col = prior_box.dims()[0];
+    int64_t len = prior_box.dims()[1];
+
+    auto* target_box_data = target_box.data<T>();
+    auto* prior_box_data = prior_box.data<T>();
+    auto* prior_box_var_data = prior_box_var.data<T>();
+
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        T prior_box_width =
+            prior_box_data[j * len + 2] - prior_box_data[j * len];
+        T prior_box_height =
+            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_center_x =
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+
+        T target_box_center_x = prior_box_var_data[j * len] *
+                                    target_box_data[i * len] * prior_box_width +
+                                prior_box_center_x;
+        T target_box_center_y = prior_box_var_data[j * len + 1] *
+                                    target_box_data[i * len + 1] *
+                                    prior_box_height +
+                                prior_box_center_y;
+        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
+                                      target_box_data[i * len + 2]) *
+                             prior_box_width;
+        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
+                                       target_box_data[i * len + 3]) *
+                              prior_box_height;
+
+        size_t offset = i * col * len + j * len;
+        output[offset] = target_box_center_x - target_box_width / 2;
+        output[offset + 1] = target_box_center_y - target_box_height / 2;
+        output[offset + 2] = target_box_center_x + target_box_width / 2;
+        output[offset + 3] = target_box_center_y + target_box_height / 2;
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    T* output = output_box->data<T>();
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
index 9c655d6c0d8e5fe04ee6d85f7e9d9da68105230c..79b8c6f59c7ad3d77aa969f6b4f36f8050cfe823 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -54,7 +54,15 @@ class CompareOpKernel
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     using T = typename Functor::ELEM_TYPE;
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
+    using Tensor = framework::Tensor;
+
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    z->mutable_data<T>(context.GetPlace());
+    int axis = context.Attr<int>("axis");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
+                                                          Functor(), z);
   }
 };
 
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
index 3cae61a438431e72cb24d714c761676cc0c3a41f..bdcdb85be7a94a748961048ac97e69f2f3b78677 100644
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -41,6 +41,21 @@ class ConditionalOp : public framework::OperatorBase {
         });
     return retv;
   }
+
+  bool ScalarCondition(
+      const std::vector<const framework::LoDTensor *> &ips) const {
+    if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
+      PADDLE_THROW("should have one initialized input as condition");
+    }
+    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&
+          ips[0]->numel() == 1)) {
+      PADDLE_THROW(
+          "condition input's data type should be bool, "
+          "numel should be 1, actual numel is %d",
+          ips[0]->numel());
+    }
+    return ips[0]->data<bool>()[0];
+  }
 };
 
 class ConditionalBlockOp : public ConditionalOp {
@@ -53,9 +68,15 @@ class ConditionalBlockOp : public ConditionalOp {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
-    bool need_run = std::all_of(
-        xs.begin(), xs.end(),
-        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    bool need_run;
+    if (Attr<bool>("is_scalar_condition")) {
+      need_run = ScalarCondition(xs);
+    } else {
+      need_run = std::all_of(
+          xs.begin(), xs.end(),
+          [](const framework::LoDTensor *t) { return t->numel() != 0; });
+    }
 
     if (need_run) {
       auto *scope_var = scope.FindVar(Output("Scope"));
@@ -88,6 +109,10 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
               "scope is std::vector<Scope*>");
     AddAttr<framework::BlockDesc *>(
         "sub_block", "The step block of conditional block operator");
+    AddAttr<bool>("is_scalar_condition",
+                  "the input X is used as scalar "
+                  "condition")
+        .SetDefault(false);
     AddComment(R"DOC(Conditional block operator
 
 Run the sub-block if X is not empty. Params is the other inputs and Out is the
@@ -106,9 +131,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
-    bool need_run = std::all_of(
-        xs.begin(), xs.end(),
-        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    bool need_run;
+    if (Attr<bool>("is_scalar_condition")) {
+      need_run = ScalarCondition(xs);
+    } else {
+      need_run = std::all_of(
+          xs.begin(), xs.end(),
+          [](const framework::LoDTensor *t) { return t->numel() != 0; });
+    }
 
     if (need_run) {
       auto *scope_var = scope.FindVar(Input("Scope"));
@@ -182,6 +213,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetOutput(framework::GradVarName("Params"),
                        InputGrad("Params", false));
     grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
+    grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d6882b275b22b9a2a2b6ff8cfb53a3462bbdbefe..cef7ddd5fe7e12a374fb9cc79211bd2eb97c6c52 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
+
+// depthwise convolution op
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
+// depthwise conv kernel
+// TODO(xingzhaolong): neon kernel for mobile
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
+
 REGISTER_OP_CPU_KERNEL(
     conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 4f942444f3eb5584f07399b8d1b4d6a5087496d4..d0bd40ee95dab3b2589742b8a0c3a5de7918b5b9 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -16,6 +16,16 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d_grad,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
+
 REGISTER_OP_CUDA_KERNEL(
     conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index 5a8933e7915960f9fcbe92ae73c4f37b3b69ecaf..3c1d0e9c1c4bb964bfaebc3bfed115548bd53f97 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/vol2col.h"
@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     }
   }
 };
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input->dims()[1], 0,
+        "The output channels must be a multiple of the input channels");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
+        depthwiseConvInputGrad;
+    math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
+        depthwiseConvFilterGrad;
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                             paddings, input_grad);
+    }
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
+                              filter_grad);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/create_reader_op.cc b/paddle/operators/create_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ba2a25ab4c679f638e928a9e04c20d683a93630
--- /dev/null
+++ b/paddle/operators/create_reader_op.cc
@@ -0,0 +1,205 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+static std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
+  std::vector<framework::DDim> res;
+  int offset = 0;
+  for (int len : ranks) {
+    auto start_it = shape_concat.begin() + offset;
+    auto end_it = start_it + len;
+    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
+    offset += len;
+  }
+  return res;
+}
+
+// general infershape for file readers
+class CreateFileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output file reader should not be null.");
+    const auto shape_concat =
+        ctx->Attrs().Get<std::vector<int>>("shape_concat");
+    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    ctx->SetReaderDims("Out", shapes);
+  }
+};
+
+// general infershape for decorated readers
+class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
+                   "Input(UnderlyingReader) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
+  }
+};
+
+// general var type inference for all readers
+class CreateReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Output("Out")[0];
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    reader->SetType(framework::proto::VarDesc::READER);
+  }
+};
+
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
+                                                     Attr<float>("max")));
+  }
+};
+
+class CreateRandomDataGeneratorOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddOutput("Out", "(ReaderHolder) The created random reader.");
+    AddAttr<std::vector<int>>("shape_concat",
+                              "The concat of all data's shapes.");
+    AddAttr<std::vector<int>>(
+        "ranks",
+        "The ranks of each data."
+        "e.g."
+        "shape_concat = [2,3,4,5,6]"
+        "ranks = [3,2]"
+        "It means the reader will generate two data each time,"
+        "whose shapes are [2,3,4] and [5,6] respectively.");
+    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+
+      This Op creates a random reader. 
+      The reader generates random data instead of really reading from files.
+      Generated data follow an uniform distribution between 'min' and 'max'.
+    )DOC");
+  }
+};
+
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
+                                            Attr<int>("buffer_size")));
+  }
+};
+
+class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a shuffle reader.");
+    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order. 
+    )DOC");
+  }
+};
+
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new framework::BatchReader(underlying_reader.Get(),
+                                          Attr<int>("batch_size")));
+  }
+};
+
+class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput(
+        "UnderlyingReader",
+        "(ReaderHolder) The underlying reader for creating a batch reader.");
+    AddOutput("Out", "(ReaderHolder) The created batch reader.");
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+
+      A batch reader takes another reader as its 'underlying reader', 
+      gathers the underlying reader's outputs and then yields them in batches. 
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(create_random_data_generator,
+                  ops::CreateRandomDataGeneratorOp<float>,
+                  ops::CreateFileReaderInferShape,
+                  ops::CreateRandomDataGeneratorOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
+REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateShuffleReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
+REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
+                  ops::CreateDecoratedReaderInferShape,
+                  ops::CreateBatchReaderOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CreateReaderInferVarType);
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
index 45635f16745346b08f7e31db2f25905bdbc3aeeb..cea595d7c5d461b40198e622abf08248e7ca69e1 100644
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -69,18 +69,25 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 
     auto stream = ctx.cuda_device_context().stream();
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
         merge_repeated, dev_out_lod0_ptr, output_data);
 
     // set output lod
-    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                              dev_out_lod0.end());
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
     framework::LoD out_lod;
     out_lod.push_back(host_out_lod0);
     output->set_lod(out_lod);
 
     // resize output dims
     output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+
+    if (host_out_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+      set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                   output, -1);
+    }
   }
 };
 
diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h
index fed89aa1e899a2450b315f352b9695056ed13aec..54ad1d6f5cc96c884c9e0c101c44d8d629792f8f 100644
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <string.h>
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
 namespace paddle {
 namespace operators {
 
@@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel<T> {
     framework::LoD output_lod;
     output_lod.push_back(output_lod0);
     output->set_lod(output_lod);
-
     // resize output dims
     output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+    // for empty sequence
+    if (output_lod0.back() == 0) {
+      output->Resize({1, 1});
+      output_data = output->mutable_data<T>(ctx.GetPlace());
+      output_data[0] = -1;
+    }
   }
 };
 
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
index 1e41587c418fb0ce4e452d5c6735c54e2d42f798..9b5f7afc6a48f13ff999f635efeb9e7bf0a76fb5 100644
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "grpc_client.h"
+#include "paddle/framework/threadpool.h"
 namespace paddle {
 namespace operators {
 namespace detail {
@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
                                   const framework::Scope& scope,
                                   const std::string& var_name,
                                   int64_t time_out) {
-  sendrecv::VariableMessage req;
-  auto* var = scope.FindVar(var_name);
-  SerializeToMessage(var_name, var, ctx, &req);
-
-  // varhandle
-  VarHandle var_h;
-  var_h.ep = ep;
-  var_h.scope = &scope;
-  var_h.name = var_name;
-  var_h.ctx = &ctx;
-
-  // stub context
-  auto ch = GetChannel(ep);
-  SendProcessor* s = new SendProcessor(ch);
-  s->Prepare(var_h, time_out);
-  s->response_call_back_ = NULL;
-
-  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+    auto* var = p_scope->FindVar(var_name_val);
+    sendrecv::VariableMessage req;
+    SerializeToMessage(var_name_val, var, *p_ctx, &req);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    SendProcessor* s = new SendProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = NULL;
+
+    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });
 
   req_count_++;
 
@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 void ProcGetResponse(const VarHandle& var_h,
                      const sendrecv::VariableMessage& ret_msg) {
   auto* outvar = var_h.scope->FindVar(var_h.name);
-
-  std::istringstream iss(ret_msg.serialized());
   DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
 }
 
@@ -60,44 +66,78 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
                                  const framework::Scope& scope,
                                  const std::string& var_name,
                                  int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    GetProcessor* s = new GetProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = ProcGetResponse;
+
+    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });
+
+  req_count_++;
+
+  return true;
+}
+
+bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
   sendrecv::VariableMessage req;
-  req.set_varname(var_name);
-
-  // varhandle
-  VarHandle var_h;
-  var_h.ep = ep;
-  var_h.scope = &scope;
-  var_h.name = var_name;
-  var_h.ctx = &ctx;
-
-  // stub context
-  auto ch = GetChannel(ep);
-  GetProcessor* s = new GetProcessor(ch);
-  s->Prepare(var_h, time_out);
-  s->response_call_back_ = ProcGetResponse;
-
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  req.set_varname(BATCH_BARRIER_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, (void*)s);
-
   req_count_++;
 
   return true;
 }
 
 bool RPCClient::Wait() {
-  bool ok = true;
+  if (req_count_ <= 0) {
+    return true;
+  }
+  const size_t kReqCnt = req_count_;
+  bool a[kReqCnt];
+  std::vector<std::future<void>> waits(req_count_);
 
-  while (true) {
-    if (req_count_ <= 0) {
-      break;
-    }
+  for (int i = 0; i < req_count_; i++) {
+    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+  }
+
+  for (int i = 0; i < req_count_; i++) {
+    waits[i].wait();
+  }
 
-    if (!Proceed()) {
+  int last_req_count = req_count_;
+  req_count_ = 0;
+
+  for (int i = 0; i < last_req_count; i++) {
+    if (!a[i]) {
       return false;
     }
   }
 
-  return ok;
+  return true;
 }
 
 bool RPCClient::Proceed() {
@@ -124,7 +164,6 @@ bool RPCClient::Proceed() {
 
   c->Process();
   delete c;
-  req_count_--;
   return true;
 }
 
diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h
index a62e70a2533ae52d84d010504b19fed5aeb15dc0..f9499f6dc70c541c214e0b659f10b2ed1e8e8581 100644
--- a/paddle/operators/detail/grpc_client.h
+++ b/paddle/operators/detail/grpc_client.h
@@ -71,6 +71,15 @@ class ClientBase {
     context_->set_deadline(deadline);
   }
 
+  virtual void Prepare(int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+
+    std::chrono::system_clock::time_point deadline =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+    context_->set_deadline(deadline);
+  }
+
   virtual void Process() = 0;
 
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
@@ -117,6 +126,17 @@ class GetProcessor : public ClientBase {
   RequestGetCallBack response_call_back_ = ProcGetResponse;
 };
 
+class BatchBarrierProcessor : public ClientBase {
+ public:
+  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
+      : ClientBase(ch) {}
+
+  virtual ~BatchBarrierProcessor() {}
+
+  virtual void Process() {}
+  sendrecv::VoidMessage reply_;
+};
+
 class RPCClient {
  public:
   bool AsyncSendVariable(const std::string& ep,
@@ -130,6 +150,10 @@ class RPCClient {
                         const framework::Scope& scope,
                         const std::string& var_name,
                         int64_t time_out = 600 * 1000);
+
+  bool AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = 600 * 1000);
+
   bool Wait();
 
  private:
diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc
index 3ddcd839bdd23547216465dfaf44a3cd8285fe6d..4f94e1315fbd2810a05354f7c3fc54ea30967e8a 100644
--- a/paddle/operators/detail/grpc_server.cc
+++ b/paddle/operators/detail/grpc_server.cc
@@ -132,6 +132,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
 
   cq_send_ = builder.AddCompletionQueue();
   cq_get_ = builder.AddCompletionQueue();
+
   server_ = builder.BuildAndStart();
   LOG(INFO) << "Server listening on " << address_ << std::endl;
 
@@ -141,11 +142,11 @@ void AsyncGRPCServer::RunSyncUpdate() {
       std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
 
   t_send_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false,
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_send_.get(), "cq_send", send_register)));
 
   t_get_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true,
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
 
   // wait server
@@ -174,7 +175,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
   }
   RequestSend* send =
       new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
-  VLOG(4) << "create RequestSend status:" << send->Status();
+  VLOG(4) << "Create RequestSend status:" << send->Status();
 }
 
 void AsyncGRPCServer::TryToRegisterNewGetOne() {
@@ -184,11 +185,11 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
   }
   RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
                                    &var_get_queue_);
-  VLOG(4) << "create Requestget status:" << get->Status();
+  VLOG(4) << "Create RequestGet status:" << get->Status();
 }
 
-// FIXME(typhoonzero): remove wait argument and change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
+// FIXME(typhoonzero): change cq_name to enum.
+void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
                                     std::function<void()> TryToRegisterNewOne) {
   TryToRegisterNewOne();
diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h
index 1ca9086c744c558fd05fb4fc1a7280729afbec28..3f8b9d93176148619d6820f6a365d9da2e73b10d 100644
--- a/paddle/operators/detail/grpc_server.h
+++ b/paddle/operators/detail/grpc_server.h
@@ -57,8 +57,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
   void ShutDown();
 
  protected:
-  void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq,
-                     std::string cq_name,
+  void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
                      std::function<void()> TryToRegisterNewOne);
   void TryToRegisterNewSendOne();
   void TryToRegisterNewGetOne();
diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/operators/detail/sendrecvop_utils.h
index bc6581afab93c626c7c2439d699c6c2d858df9fa..8e66f7299c7b4d30bc5a6fe6a18b7cb3ae3827a5 100644
--- a/paddle/operators/detail/sendrecvop_utils.h
+++ b/paddle/operators/detail/sendrecvop_utils.h
@@ -30,6 +30,9 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
+#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
+
 void SerializeToMessage(const std::string& name, const framework::Variable* var,
                         const platform::DeviceContext& ctx,
                         sendrecv::VariableMessage* msg);
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 35cb18797ff66cb87a6658e73ce02b0bfae29baa..5274aa204e6629c9c5ea850c433e0948c89015bd 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                          "'dropout_prob' must be between 0.0 and 1.0.");
         });
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest or for debug that always the same output units "
+                  "will be dropped.")
+        .SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
     AddComment(R"DOC(
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index c56930336e865079f1b96df0f35b0a051fe63a27..84d78445a4fa340ba3c066bb48b96b2a890db652 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
-      int seed = context.Attr<int>("seed");
+
+      std::random_device rnd;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+
       thrust::counting_iterator<unsigned int> index_sequence_begin(0);
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(mask_data),
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index c90b8d277eb78048c001d36a367287146b51c636..46e5dbc64ff9ad3d04a9c1c07f4226932f661baf 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
-      int seed = context.Attr<int>("seed");
+
+      // NOTE: fixed seed should only be used in unittest or for debug.
+      // Guarantee to use random seed in training.
+      std::random_device rnd;
       std::minstd_rand engine;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
       engine.seed(seed);
+
       std::uniform_real_distribution<float> dist(0, 1);
       size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index a8389429f26c17ceab1db22175c90888546ead6f..c24f97a85092ff14e8211ca8bc4bb9b155510a2c 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -28,7 +28,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          AddFunctor<T>(), z);
   }
 };
 
@@ -92,9 +100,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
                            ElementwiseAddBroadCastGradFunctor<T>,
-                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseAddBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index ef26cb6c914f50ded07cc9d0d8de3f49f2151129..dc863cc598ec6015067f166b1544a5d20223662a 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -28,7 +28,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          DivFunctor<T>(), z);
   }
 };
 
@@ -111,9 +119,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
                            ElementwiseDivBroadCastGradFunctor<T>,
-                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseDivBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h
index 255728e8e620665a7de225b228c19d6c510da1c8..67efe4e1511e054d54f91b5aa22ce28f222ed20a 100644
--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/operators/elementwise_max_op.h
@@ -28,7 +28,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MaxFunctor<T>(), z);
   }
 };
 
@@ -110,9 +118,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
                            ElementwiseMaxBroadCastGradFunctor<T>,
-                           ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMaxBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h
index e6627a0f1bb468c8e4661b83489cb964b72dddb0..cf11759404d3342b8a1c0080fa09f6cd57e735db 100644
--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/operators/elementwise_min_op.h
@@ -28,7 +28,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MinFunctor<T>(), z);
   }
 };
 
@@ -110,9 +118,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
                            ElementwiseMinBroadCastGradFunctor<T>,
-                           ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMinBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 4b86b00b5a095ae898f9ce0c17cde2cc91060ba9..773125f5ca54e7b529df47a2823d56a5ad71e50d 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -27,7 +27,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          MulFunctor<T>(), z);
   }
 };
 
@@ -110,9 +118,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
                            ElementwiseMulBroadCastGradFunctor<T>,
-                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMulBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index d749b8e8757d0d433be05876779ccc22b95ca80b..74abf7c4a58788eb0e53025886f10f5a43021a9e 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
 
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
-void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* out = ctx.Input<Tensor>("Out");
-  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
 
+                            const framework::Tensor* x,
+                            const framework::Tensor* y,
+                            const framework::Tensor* out,
+                            const framework::Tensor* dout, int axis,
+                            framework::Tensor* dx, framework::Tensor* dy) {
   auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
   auto x_dims = x->dims();
   auto y_dims = y->dims();
 
-  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
   if (dx) {
     dx->mutable_data<T>(ctx.GetPlace());
   }
@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     x_dims = framework::make_ddim(extended_dims);
   }
 
-  int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
 
   int pre, n, post;
@@ -367,15 +363,12 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
 
 template <typename Functor, typename DeviceContext, typename T,
           typename OutType = T>
-void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<OutType>(ctx.GetPlace());
+void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
+                          const framework::Tensor* x,
+                          const framework::Tensor* y, int axis, Functor func,
+                          framework::Tensor* z) {
   TransformFunctor<Functor, T, DeviceContext, OutType> functor(
-      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
+      x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
   auto y_dims = y->dims();
@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
     x_dims = framework::make_ddim(extended_dims);
   }
 
-  int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/operators/elementwise_pow_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5293cc7dd34ccee860c50e964516da9b4d42d29c
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_pow_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwisePowOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Pow", "Out = X ^ Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
+                             ops::ElementwisePowOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/operators/elementwise_pow_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..643c978e635bc8e9671b47774c2eac5b713f59c2
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_pow_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c5dd031ec46ebecaabb701839c0f69c02678eb0
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cmath>
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct PowFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwisePowKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          PowFunctor<T>(), z);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index a2aca793026189ec87e00b52d7c351689f870400..6a88c5f6b4c869f8ab5b4fa3b112ffc264be7145 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -27,7 +27,15 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          SubFunctor<T>(), z);
   }
 };
 
@@ -93,9 +101,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
                            ElementwiseSubBroadCastGradFunctor<T>,
-                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseSubBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index d738e1850ca4f658f4fca5c9bf643c44f676cce9..789d01e0022b5c36957f295265a9dc42649b310f 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -52,7 +52,11 @@ class FeedOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
-    framework::Copy(feed_item, place, dev_ctx, out_item);
+    if (platform::is_same_place(feed_item.place(), place)) {
+      out_item->ShareDataWith(feed_item);
+    } else {
+      framework::Copy(feed_item, place, dev_ctx, out_item);
+    }
     out_item->set_lod(feed_item.lod());
   }
 };
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
index 76f2adefede3b4bc4035f86f8f8663eed29343ae..fb901b639492a179925ff852f9030fc6674d1f63 100644
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -135,14 +135,14 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 GRU Operator implements part calculations of the complete GRU as following:
 
-\f[
-update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
-output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+$$
+update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
 output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
-\f]
+$$
 
-@note To implement the complete GRU, fully-connected operator must be used  
+@note To implement the complete GRU, fully-connected operator must be used
 before to feed xu, xr and xc as the Input of GRU operator.
 )DOC");
   }
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index b1957fb9ce6add8628cb206abf2c569d3f615c85..a08bd4233b02d021aaa64bafe4b855f11a60d338 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
 
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
     gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     Tensor ordered_h0;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (h0) {
       // Since the batch computing for GRU reorders the input sequences
       // according to their length. The initialized cell state also needs
@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
     zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
 
     Tensor ordered_h0, ordered_h0_grad;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (h0) {
       ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
                                          true);
diff --git a/paddle/operators/label_smooth_op.cc b/paddle/operators/label_smooth_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c89082f44b360cbd171eccb212674040b8688a46
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LabelSmoothOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LabelSmoothOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LabelSmoothOp should not be null.");
+    auto in_dims = ctx->GetInputDim("X");
+    if (ctx->HasInput("PriorDist")) {
+      auto noise_dims = ctx->GetInputDim("PriorDist");
+      auto noise_numel = paddle::framework::product(noise_dims);
+      PADDLE_ENFORCE(
+          in_dims[1] == noise_numel,
+          "The number of elements in Input(PriorDist) must be equal to the "
+          "dimension of each label.");
+    }
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", in_dims);
+  }
+};
+
+class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) The input labels of LabelSmooth operator. This "
+             "input can be batched labels in one-hot encoding or output from "
+             "softmax, with shape [N x K], where N is the batch size and K is "
+             "the number of classes");
+    AddInput("PriorDist",
+             "(Tensor, optional)"
+             "The prior distribution to be added to the smoothed label. It is "
+             "fixed during training and the number of elements should be equal "
+             "to the dimension K of each label. Default is uniform "
+             "distribution and each element will be set to 1/K if not provided "
+             "in input.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(loDTensor) The smoothed label of LabelSmooth operator. It has"
+              "the same shape and LoD with the Input(LoDTensor).");
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0f)"
+                   "The smoothing parameter of LabelSmooth operator.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+LabelSmooth Operator.
+
+Label smoothing is a mechanism to regularize the classifier layer. In machine 
+learning, optimizing the log-likelihood of the correct label directly may 
+cause two problems. First, it may result in overfitting: if the model learns 
+to assign full probability to the ground-truth label for each training example,
+it is not guaranteed to generalize. Second, it encourages the differences 
+between the largest logit and all others to become large, reducing the ability 
+of the model to adapt. Label smoothing is proposed to encourage the model to 
+be less confident, which replaces the ground-truth label $y$ with the weighted 
+sum of itself and some fixed distribution $\mu$, i.e.
+
+$$
+    \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
+$$
+
+where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and 
+$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for 
+$\mu$. This change in the ground-truth label is called label-smoothing 
+regularization or LSR.
+
+See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+)DOC");
+  }
+};
+
+class LabelSmoothGradOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothGradOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.cu b/paddle/operators/label_smooth_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5a0cec12bc58a56e4b0c3bd6fbc6c4754ef81fa4
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.h b/paddle/operators/label_smooth_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..87bc9f793e3b4e249142710243c45d51f3a913b2
--- /dev/null
+++ b/paddle/operators/label_smooth_op.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LabelSmoothKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
+    auto* in_t = ctx.Input<framework::LoDTensor>("X");
+    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
+    auto label_dim = in_t->dims()[1];
+    out_t->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto in = framework::EigenVector<T>::Flatten(*in_t);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    if (dist_t) {
+      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
+      out.device(dev) =
+          static_cast<T>(1 - epsilon) * in +
+          epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
+    } else {
+      out.device(dev) = static_cast<T>(1 - epsilon) * in +
+                        static_cast<T>(epsilon / label_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LabelSmoothGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_in_t->mutable_data<T>(ctx.GetPlace());
+
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9b774272cb7c9d87140bf30d2eabb44f49b2b7c
--- /dev/null
+++ b/paddle/operators/layer_norm_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/layer_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+class LayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of LayerNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
+                      "'begin_norm_axis' must be less than the rank of X.");
+
+    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {left});
+    ctx->SetOutputDim("Variance", {left});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("Scale",
+             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+        .AsIntermediate();
+    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+        .AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-5) Constant for "
+                   "numerical stability")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<int>("begin_norm_axis",
+                 "(int default:1), the "
+                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H].")
+        .SetDefault(1)
+        .AddCustomChecker([](const int &begin_norm_axis) {
+          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
+                            "'begin_norm_axis' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Layer Normalization.
+Layer Norm has been implemented as discussed in the paper:
+https://arxiv.org/abs/1607.06450
+...
+)DOC");
+  }
+};
+
+class LayerNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of LayerNormOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/layer_norm_op.cu b/paddle/operators/layer_norm_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..77d13b216f0e8d6d4434742908437f1eb74818c9
--- /dev/null
+++ b/paddle/operators/layer_norm_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/layer_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c436b89263758bbc0abcd1bb71cef3e1370d2a5
--- /dev/null
+++ b/paddle/operators/layer_norm_op.h
@@ -0,0 +1,238 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct SubAndSquareFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
+};
+
+template <typename T>
+struct DivAndSqrtFunctor {
+  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a / (sqrt(b + epsilon_));
+  }
+
+ private:
+  T epsilon_;
+};
+
+template <typename T>
+struct MulFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct SubFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename T>
+struct MulInvVarFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a * std::sqrt(1.0 / b);
+  }
+};
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename DeviceContext, typename T>
+class LayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto x = *ctx.Input<Tensor>("X");
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    const auto x_dims = x.dims();
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    x.Resize(matrix_shape);
+    Tensor out;
+    out.ShareDataWith(*y);
+    out.Resize(matrix_shape);
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::RowwiseMean<DeviceContext, T> row_mean;
+
+    // get mean
+    row_mean(dev_ctx, x, mean);
+
+    // get variance
+    ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
+    row_mean(dev_ctx, out, var);
+
+    // get x_norm
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+        ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
+    ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+        ctx, &out, var, /*axis*/ 0,
+        DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
+
+    if (scale) {
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
+    }
+    if (bias) {
+      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
+          ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LayerNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto x = *ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Y");
+    auto *mean = ctx.Input<Tensor>("Mean");
+    auto *var = ctx.Input<Tensor>("Variance");
+    auto *scale = ctx.Input<Tensor>("Scale");
+    auto *bias = ctx.Input<Tensor>("Bias");
+    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto &x_dims = x.dims();
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    framework::DDim matrix_shape({left, right});
+
+    d_y.Resize(matrix_shape);
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    math::ColwiseSum<DeviceContext, T> colwise_sum;
+
+    Tensor temp;
+    Tensor temp_norm;
+    if (d_scale || d_x) {
+      x.Resize(matrix_shape);
+      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
+
+      if (!(bias && scale)) {
+        temp_norm.ShareDataWith(*y);
+        temp_norm.Resize(matrix_shape);
+      } else {
+        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
+        // get x_norm
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
+        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+            ctx, &temp_norm, var, /*axis*/ 0,
+            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
+      }
+    }
+
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      colwise_sum(dev_ctx, d_y, d_bias);
+    }
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
+      colwise_sum(dev_ctx, temp, d_scale);
+    }
+
+    if (d_x) {
+      framework::DDim vec_shape({left});
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto dx_dim = d_x->dims();
+      Tensor temp_vec;
+      temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
+
+      math::RowwiseMean<DeviceContext, T> row_mean;
+
+      if (d_scale) {
+        // dy_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
+        framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, temp, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      } else {
+        // dy_dx
+        framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
+
+        // dy_dmean_dx
+        row_mean(dev_ctx, d_y, &temp_vec);
+        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+        // dy_var_dx
+        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+            ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
+      }
+      // dy_var_dx
+      row_mean(dev_ctx, temp, &temp_vec);
+      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
+          ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
+      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
+          ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
+
+      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
+          ctx, d_x, var, /*axis*/ 0,
+          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
+      d_x->Resize(dx_dim);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/listen_and_serv_op.cc b/paddle/operators/listen_and_serv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..099f6b23736adcc2a6e9c27dca297178687ae785
--- /dev/null
+++ b/paddle/operators/listen_and_serv_op.cc
@@ -0,0 +1,207 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+
+#include <unistd.h>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/operators/detail/grpc_server.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kOptimizeBlock[] = "OptimizeBlock";
+
+void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
+  service->RunSyncUpdate();
+  VLOG(4) << "RunServer thread end";
+}
+
+static void CreateTensorFromMessageType(framework::Variable *var,
+                                        sendrecv::VarType var_type) {
+  if (var_type == sendrecv::VarType::LOD_TENSOR) {
+    var->GetMutable<framework::LoDTensor>();
+  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
+    var->GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW(
+        "VariableMessage type %d is not in "
+        "[LoDTensor, SelectedRows]",
+        var_type);
+  }
+}
+
+class ListenAndServOp : public framework::OperatorBase {
+ public:
+  ListenAndServOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+    }
+  }
+
+  void Stop() override {
+    detail::MessageWithName term_msg;
+    term_msg.first = LISTEN_TERMINATE_MESSAGE;
+    rpc_service_->Push(term_msg);
+    rpc_service_->ShutDown();
+    server_thread_->join();
+  }
+
+  std::string GetGradVarNameForTrainer(const std::string &varname) const {
+    if (grads_counter_.find(varname) == grads_counter_.end()) {
+      grads_counter_[varname] = 0;
+    }
+    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    framework::Scope &recv_scope = scope.NewScope();
+
+    // FIXME(Yancey1989): initialize rpc server with lazy mode.
+    rpc_service_->SetScope(&recv_scope);
+    rpc_service_->SetDevCtx(&dev_ctx);
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto fan_in = Attr<int>("Fanin");
+
+    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+    auto *program = block->Program();
+    framework::Executor executor(dev_place);
+
+    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
+    bool exit_flag = false;
+    while (!exit_flag) {
+      // Get from multiple trainers, we don't care about the order in which
+      // the gradients arrives, just add suffix 0~n and merge the gradient.
+      rpc_service_->SetCond(0);
+      size_t recv_var_cnt = 0;
+      int batch_barrier = 0;
+      while (batch_barrier != fan_in) {
+        const detail::MessageWithName &v = rpc_service_->Get();
+        auto grad_var_name = v.first;
+        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
+          LOG(INFO) << "received terminate message and exit";
+          exit_flag = true;
+          break;
+        } else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
+          VLOG(3) << "recv batch barrier message";
+          batch_barrier++;
+          continue;
+        } else {
+          // receive a variable
+          recv_var_cnt++;
+          auto it =
+              std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+          std::string param_var_name;
+          if (it != grad_list.end()) {
+            param_var_name = param_list[it - grad_list.begin()];
+          } else {
+            LOG(ERROR) << "grad has no paired param:" << grad_var_name;
+          }
+          VLOG(3) << "received grad: " << grad_var_name
+                  << " updating param: " << param_var_name;
+
+          if (fan_in > 1) {
+            grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
+          }
+          auto *var = recv_scope.FindVar(grad_var_name);
+          if (var == nullptr) {
+            LOG(ERROR) << "Can not find server side var: " << grad_var_name;
+            PADDLE_THROW("Can not find server side var");
+          }
+          detail::DeserializeFromMessage(v.second, dev_ctx, var);
+        }
+      }
+      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
+      // TODO(Yancey1989): merge SelectedRows variables here
+      if (exit_flag) {
+        rpc_service_->ShutDown();
+      }
+
+      try {
+        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
+                     false /*create_local_scope*/, false /*create_vars*/);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+      rpc_service_->SetCond(1);
+      rpc_service_->WaitClientGet(recv_var_cnt);
+      grads_counter_.clear();
+    }  // while(true)
+  }
+
+ protected:
+  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+  mutable std::unordered_map<std::string, int> grads_counter_;
+};
+
+class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(
+ListenAndServ operator
+
+This operator will start a RPC server which can receive variables
+from send_op and send back variables to recv_op.
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
+                                    "BlockID to run on server side.");
+    AddAttr<std::vector<std::string>>(
+        "ParamList", "type list of string",
+        "grad->param name mapping to find which parameters to optimize.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "GradList", "type list of string",
+        "grad->param name mapping to find which parameters to optimize.")
+        .SetDefault({});
+    AddAttr<int>("Fanin", "type int",
+                 "Number of trainers in the current cluster job")
+        .SetDefault(1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
+                  ops::ListenAndServOpMaker);
diff --git a/paddle/operators/load_combine_op.cc b/paddle/operators/load_combine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4be793d7bf1f346c011842c57fb5b5179a697d6
--- /dev/null
+++ b/paddle/operators/load_combine_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class LoadCombineOp : public framework::OperatorBase {
+ public:
+  LoadCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+
+    std::ifstream fin(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fin),
+                   "Cannot open file %s for load_combine op", filename);
+
+    auto out_var_names = Outputs("Out");
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      auto *out_var = scope.FindVar(out_var_names[i]);
+
+      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                     out_var_names[i]);
+
+      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+                     filename);
+
+      // Get data from fin to tensor
+      DeserializeFromStream(fin, tensor, dev_ctx);
+
+      if (platform::is_gpu_place(place)) {
+        // copy CPU to GPU
+        framework::LoDTensor cpu_tensor;
+        cpu_tensor.ShareDataWith(*tensor);
+        cpu_tensor.set_lod(tensor->lod());
+
+        // reset tensor
+        out_var->Clear();
+        tensor = out_var->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(cpu_tensor.lod());
+        Copy(cpu_tensor, place, dev_ctx, tensor);
+      }
+    }
+  }
+};
+
+class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput(
+        "Out",
+        "(vector) The output LoDTensors that will be read from the input file.")
+        .AsDuplicable();
+    AddAttr<std::string>("file_path",
+                         "(string) "
+                         "LoDTensors will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+LoadCombine Operator.
+
+LoadCombine operator loads LoDTensor variables from a file. The file should 
+contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator applies a deserialization strategy to appropriately load 
+the LodTensors, and this strategy complements the serialization strategy used 
+in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
+with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+that were saved using the SaveCombine operator.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
+                  ops::LoadCombineOpProtoMaker);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index d97390fa1c53fa0bdf16ab34cb209b994621f83c..07372808bbf078bd2e9b0bb5782b95a046253f46 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       new_rows.resize(ids_dim[0]);
       auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
-      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
-                   ids_dim[0] * sizeof(int64_t), stream);
+      memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
+                   ids_data, ids_dim[0] * sizeof(int64_t), stream);
 
       d_table->set_rows(new_rows);
 
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
index c57ee414dc5b3417549c8ac3a7fd57a9c8f452df..72e95b75e29c88c5944607ceaa40435bac7a745c 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
 
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
     }
     lstm_value.prev_state_value = nullptr;
     Tensor ordered_c0;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (cell_t0) {
       // Since the batch computing for LSTM reorders the input sequence
       // according to their length. The initialized cell state also needs
@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
     // initialization.
     Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (c0) {
       ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
                                          true);
diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c96b30ba353fabc48630258ea8f88f741b8c415e
--- /dev/null
+++ b/paddle/operators/lstmp_op.cc
@@ -0,0 +1,331 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP operator should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Projection"),
+                   "Output(Projection) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                   "Output(Cell) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(BatchGate) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                   "Output(BatchCellPreAct) of LSTMP operator should not be "
+                   "null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(BatchHidden) of LSTMP operator should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "Input(X)'s rank of LSTMP operator must be 2.");
+
+    int frame_size = in_dims[1] / 4;
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto proj_dims = ctx->GetInputDim("ProjWeight");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The rank of Input(Weight) should be 2.");
+    PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
+                      "The first dimension of Input(Weight) "
+                      "should be %d.",
+                      proj_dims[1]);
+    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Weight) "
+                      "should be 4 * %d.",
+                      frame_size);
+
+    PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
+                      "The rank of Input(ProjWeight) should be 2.");
+    PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
+                      "The first dimension of Input(ProjWeight) "
+                      "should be %d.",
+                      frame_size);
+
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(C0) of LSTMP operator should not be null after "
+                     "Input(H0) provided.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
+    }
+
+    auto b_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+    PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                      "The first dimension of Input(Bias) should be 1.");
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
+      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "7 * %d if enable peepholes connection",
+                        frame_size);
+    } else {
+      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "4 * %d if disable peepholes connection",
+                        frame_size);
+    }
+
+    framework::DDim out_dims({in_dims[0], frame_size});
+    framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
+    ctx->SetOutputDim("Projection", proj_out_dims);
+    ctx->SetOutputDim("Cell", out_dims);
+    ctx->SetOutputDim("BatchGate", in_dims);
+    ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->SetOutputDim("BatchHidden", out_dims);
+    ctx->ShareLoD("Input", "Projection");
+    ctx->ShareLoD("Input", "Cell");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) the input for sequence data, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) the initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size and D is the hidden size.")
+        .AsDispensable();
+    AddInput("C0",
+             "(Tensor, optional) the initial cell state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size. `C0` should not be null if `H0` provided.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(Tensor) the learnable hidden-hidden weights."
+             " - The shape is (P x 4D), where P is the projection layer size "
+             "and  D is the hidden size."
+             " - Weight = {W_cr, W_ir, W_fr, W_or}");
+    AddInput("ProjWeight",
+             "(Tensor) the learnable weight of the projection layer."
+             " - The shape is (D x P), where P is the recurrent projection "
+             "layer size and  D is the hidden size."
+             " - ProjWeight = {W_rh}");
+    AddInput("Bias",
+             "(Tensor) the learnable biases, which contains two parts: "
+             "input-hidden biases and peephole connections weights if "
+             "setting `use_peepholes` to `True`. "
+             "1. `use_peepholes = False` "
+             " - The shape is (1 x 4D). "
+             " - Bias = {b_c, b_i, b_f, b_o}."
+             "2. `use_peepholes = True` "
+             " - The shape is (1 x 7D). "
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+    AddOutput("Projection",
+              "(LoDTensor) the projection of the hidden state of LSTMP "
+              "operator. The shape is (T x P), and LoD is the same with the "
+              "`Input`.");
+    AddOutput("Cell",
+              "(LoDTensor) the cell state of LSTMP operator. "
+              "The shape is (T x D), and lod is the same with the `Input`.");
+    AddOutput("BatchGate",
+              "(LoDTensor) This LoDTensor contains input gate, forget gate "
+              "and output gate after the activations. This LoDTensor has the "
+              "same shape as the reorganized input, which is also be called "
+              "batch input. The LoD size is 2. The first-level LoD is the "
+              "batch offsets and the second contains the indices, which "
+              "denotes the position of reorganized sequence in the raw input.")
+        .AsIntermediate();
+    AddOutput("BatchCellPreAct",
+              "(LoDTensor) the pre-activation cell state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
+        .AsIntermediate();
+    AddOutput("BatchHidden",
+              "(LoDTensor) the hidden state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
+        .AsIntermediate();
+    AddOutput("OrderedP0",
+              "(Tensor) the projection of the initial hidden state "
+              "H0. This is a tensor with shape (N x P), where N is the "
+              "batch size and P is the hidden size.")
+        .AsIntermediate();
+    AddAttr<bool>("use_peepholes",
+                  "(bool, defalut: True) "
+                  "whether to enable diagonal/peephole connections.")
+        .SetDefault(true);
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed LSTMP.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default: sigmoid)"
+        "The activation for input gate, forget gate and output "
+        "gate, `sigmoid` by default.")
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
+                         "(string, default: tanh)"
+                         "The activation for cell output, `tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
+                         "(string, default: tanh)"
+                         "The activation for candidate hidden state, "
+                         "`tanh` by default.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("proj_activation",
+                         "(string, default: tanh)"
+                         "The activation for projection output, "
+                         "`tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddComment(R"DOC(
+Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
+
+LSTMP has a separate projection layer after the LSTM layer, projecting the 
+original hidden state to a lower-dimensional one, which is proposed to reduce 
+the number of total parameters and furthermore computational complexity for 
+the LSTM, espeacially for the case that the size of output units is relative 
+large (https://research.google.com/pubs/archive/43905.pdf). 
+
+The formula is as follows:
+
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
+
+f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
+
+\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
+
+o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
+
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+
+h_t = o_t \odot act_h(c_t) \\
+
+r_t = \overline{act_h}(W_{rh}h_t)
+$$
+
+where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to reprenset these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the activation, such as logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$. Here $h$ is usually called the hidden 
+state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also 
+called the candidate hidden state, whose computation is based on the current 
+input and previous hidden state.
+
+The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions and `tanh` is usually
+used for them. $\overline{act_h}$ is the activation function for the 
+projection output, usually using `identity` or same as $act_h$.
+
+Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
+operations on the input $x_{t}$ are NOT included in this operator.
+Users can choose to use fully-connected operator before LSTMP operator.
+
+)DOC");
+  }
+};
+
+class LSTMPGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Projection"),
+                   "Input(Projection) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cell"),
+                   "Input(Cell) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP operator should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(BatchGate) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
+                   "Input(BatchGate) of LSTMP operator should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("ProjWeight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
+            ops::LSTMPGradOp);
+REGISTER_OP_CPU_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.cu b/paddle/operators/lstmp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7fcbcfecc871976fdfbfffbbb4e0243b91351a29
--- /dev/null
+++ b/paddle/operators/lstmp_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstmp_grad,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e064a155dfadd8104fa80727a962cb2e24ade29f
--- /dev/null
+++ b/paddle/operators/lstmp_op.h
@@ -0,0 +1,496 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/activation_op.h"
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class LSTMPKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const math::detail::ActivationType act_type, const Device& d,
+                  X x, Y y) const {
+    if (act_type == math::detail::ActivationType::kIdentity)
+      y.device(d) = x;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(ctx.GetPlace());
+    auto* proj_out = ctx.Output<LoDTensor>("Projection");
+    proj_out->mutable_data<T>(ctx.GetPlace());
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+
+    auto in_dims = input->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    framework::DDim dims({in_dims[0], frame_size});
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+
+    if (bias) {
+      Tensor b = *bias;
+      b.Resize({bias->numel(), 1});
+      Tensor gate_bias = b.Slice(0, 4 * frame_size);
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
+    }
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      // the code style in LstmpMetaValue will be updated later.
+
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+    lstmp_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
+    if (cell_t0) {
+      // Since the batch computing for LSTMP reorders the input sequence
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+                                         &ordered_c0, true);
+      lstmp_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use the local variable as here.
+    LoDTensor batch_proj, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+    auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(dims, ctx.GetPlace());    // T x D
+    batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
+    batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      Tensor proj_t = batch_proj.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros, the calculation W_h * H0 will be skiped.
+        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTMP reorders the input sequence
+        // according to their length. The initialized hidden state also needs
+        // to reorder.
+
+        Tensor ordered_h0;
+        ordered_proj0->mutable_data<T>(ctx.GetPlace());
+        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
+                                       *proj_weight, false, static_cast<T>(1.0),
+                                       ordered_proj0, static_cast<T>(0.0));
+        if (proj_act != math::detail::ActivationType::kIdentity) {
+          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+          ActCompute(cell_act, place, proj0_dev, proj0_dev);
+        }
+        math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
+                                       *weight, false, static_cast<T>(1.0),
+                                       &gate_t, static_cast<T>(1.0));
+      }
+
+      lstmp_value.gate_value = gate_t.data<T>();
+      lstmp_value.output_value = hidden_t.data<T>();
+      lstmp_value.state_value = cell_t.data<T>();
+      lstmp_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
+          cell_act, cand_act);
+      lstmp_value.prev_state_value = lstmp_value.state_value;
+      math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
+                                     false, static_cast<T>(1.0), &proj_t,
+                                     static_cast<T>(0.0));
+      if (proj_act != math::detail::ActivationType::kIdentity) {
+        auto proj_t_dev = EigenMatrix<T>::From(proj_t);
+        ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_proj.set_lod(batch_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(device_ctx, batch_proj, *proj_out);
+
+    batch_cell.set_lod(batch_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(device_ctx, batch_cell, *cell_out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LSTMPGradKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const math::detail::ActivationType act_type,
+                      const Device& d, X x, Y y, DX dx, DY dy) const {
+    // x is dummy and won't be used even in Relu(use y instead)
+    if (act_type == math::detail::ActivationType::kIdentity)
+      dx.device(d) = dy;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* proj_out = ctx.Input<LoDTensor>("Projection");
+    auto* cell_out = ctx.Input<LoDTensor>("Cell");
+
+    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
+    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+    auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
+
+    auto* projection_g =
+        ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+
+    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* proj_weight_g =
+        ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
+    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto* h0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
+    auto* c0 = ctx.Input<Tensor>("C0");
+
+    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (weight_g) {
+      weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, weight_g, static_cast<T>(0.0));
+    }
+    if (proj_weight_g) {
+      proj_weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
+    }
+
+    // ordered_h0/c0 is the reordered hidden/cell initialization.
+    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+    // initialization.
+    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
+    if (c0) {
+      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+                                         true);
+    }
+    if (c0 && c0_g) {
+      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+    }
+
+    auto in_dims = input->dims();
+    auto out_dims = cell_out->dims();
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+
+    math::LstmMetaGrad<T> lstmp_grad;
+
+    if (bias && bias_g) {
+      bias_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, bias_g, static_cast<T>(0.0));
+    }
+    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_g_data = bias_g->data<T>();
+      lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+      lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
+      lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
+    } else {
+      lstmp_grad.check_ig_grad = nullptr;
+      lstmp_grad.check_fg_grad = nullptr;
+      lstmp_grad.check_og_grad = nullptr;
+    }
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+
+    auto ToBatch = [&batch_gate, &to_batch](
+        const DeviceContext& ctx, const framework::LoDTensor& src,
+        const framework::DDim& dims, framework::LoDTensor& dst) {
+      dst.mutable_data<T>(dims, ctx.GetPlace());
+      dst.set_lod(batch_gate->lod());
+      to_batch(ctx, src, dst, false);
+    };
+
+    LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
+    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    ToBatch(device_ctx, *proj_out, proj_dims, batch_proj);        // T x P
+    ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g);  // T x P
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);         // T x D
+
+    LoDTensor batch_cell_g, batch_gate_g;
+    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    // TODO(qingqing) support the case output cell has gradient.
+    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
+    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
+    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
+    batch_gate_g.set_lod(batch_gate->lod());
+
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor cur_proj = batch_proj.Slice(bstart, bend);
+      Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+      if (proj_act != math::detail::ActivationType::kIdentity) {
+        auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
+        auto proj_g_dev = EigenMatrix<T>::From(proj_g);
+        ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
+                       proj_g_dev);
+      }
+      /* hidden state backwarad */
+      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+      math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
+                                     true, static_cast<T>(1.0), &out_g,
+                                     static_cast<T>(0.0));
+      /* projection weight backward*/
+      if (proj_weight_g) {
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
+                                       false, static_cast<T>(1.0),
+                                       proj_weight_g, static_cast<T>(1.0));
+      }
+
+      Tensor gate = batch_gate->Slice(bstart, bend);
+      Tensor cell = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
+      lstmp_value.gate_value = gate.data<T>();
+      lstmp_value.state_value = cell.data<T>();
+      lstmp_value.state_active_value = cell_pre_act.data<T>();
+
+      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
+      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
+      lstmp_grad.state_grad = cell_g.data<T>();
+      lstmp_grad.gate_grad = gate_g.data<T>();
+      lstmp_grad.output_grad = out_g.data<T>();
+
+      if (n > 0) {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
+        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
+        lstmp_value.prev_state_value = cell_pre.data<T>();
+        lstmp_grad.prev_state_grad = cell_pre_g.data<T>();
+      } else {
+        lstmp_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+        lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
+      }
+
+      int cur_batch_size = bend - bstart;
+      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
+          gate_act, cell_act, cand_act);
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
+                                       static_cast<T>(1.0), &pre_proj_g,
+                                       static_cast<T>(1.0));
+        if (weight_g) {
+          /* weight backward*/
+          auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
+          math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+      } else {
+        if (h0 && weight_g) {
+          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
+                                             &ordered_h0, true);
+          if (weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
+                                           gate_g, false, static_cast<T>(1.0),
+                                           weight_g, static_cast<T>(1.0));
+          }
+        }
+        if (h0 && (h0_g || proj_weight_g)) {
+          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          Tensor proj0_g;
+          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
+          proj0_g.mutable_data<T>(ctx.GetPlace());
+          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
+                                         true, static_cast<T>(1.0), &proj0_g,
+                                         static_cast<T>(0.0));
+          if (proj_act != math::detail::ActivationType::kIdentity) {
+            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
+            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
+                           proj0_g_dev);
+          }
+          if (h0_g) {
+            math::matmul<DeviceContext, T>(
+                device_ctx, proj0_g, false, *proj_weight, true,
+                static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
+          }
+          if (proj_weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
+                                           proj0_g, false, static_cast<T>(1.0),
+                                           proj_weight_g, static_cast<T>(1.0));
+          }
+        }
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    if (in_g) {
+      /* backward data */
+      in_g->mutable_data<T>(ctx.GetPlace());
+      to_seq(device_ctx, batch_gate_g, *in_g);
+    }
+    if (bias && bias_g) {
+      /* backward bias */
+      Tensor b_g = *bias_g;
+      b_g.Resize({bias_g->numel(), 1});
+      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+      math::ColwiseSum<DeviceContext, T> col_sum;
+      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
+    }
+
+    if (h0 && h0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
+                                         false);
+    }
+    if (c0 && c0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
+                                         false);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index c607704efac86982c8c22e462381aaab488a9b69..768106fadf355ea6fb148491e232dc0ef1453a75 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,10 +8,11 @@ if(WITH_GPU)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
     nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
-    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
     nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
     nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
@@ -28,7 +29,7 @@ else()
     cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
     cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
     cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
     cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
     cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b212e78208355866516211d276cb8046623babd7
--- /dev/null
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -0,0 +1,311 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/depthwise_conv.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// A Cuda kernel to compute the depthwise convolution forward pass
+// in NCHW format.
+template <typename T>
+__global__ void KernelDepthwiseConv(
+    const int nthreads, const T* const input_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, T* const output_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if (index < nthreads) {
+    const int batch = index / output_channels / output_height / output_width;
+    const int c_out = (index / output_height / output_width) % output_channels;
+    const int h_out = (index / output_width) % output_height;
+    const int w_out = index % output_width;
+
+    const int c_in = c_out / filter_multiplier;
+    const T* weight = filter_data + c_out * filter_height * filter_width;
+    T value = 0;
+    const int h_in_start = -padding_height + h_out * stride_height;
+    const int w_in_start = -padding_width + w_out * stride_width;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
+
+    const int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
+
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value +=
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            input_data[offset];
+      }
+    }
+    output_data[index] = value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <typename T>
+__global__ void KernelDepthwiseConvInputGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const filter_data, const int batch_size, const int output_channels,
+    const int output_height, const int output_width, const int input_channels,
+    const int input_height, const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const input_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / input_channels / input_height / input_width;
+    const int c_in = (index / input_height / input_width) % input_channels;
+    const int h_in = (index / input_width) % input_height;
+    const int w_in = index % input_width;
+
+    const int c_out_start = c_in * filter_multiplier;
+
+    int h_out_start =
+        (h_in - filter_height + padding_height + stride_height) / stride_height;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+
+    int h_out_end = (h_in + padding_height) / stride_height;
+    h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
+
+    int w_out_start =
+        (w_in - filter_width + padding_width + stride_width) / stride_width;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+
+    int w_out_end = (w_in + padding_width) / stride_width;
+    w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
+         c_out++) {
+      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+        const int filter_h = h_in + padding_height - h_out * stride_height;
+        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+          const int filter_w = w_in + padding_width - w_out * stride_width;
+          const int filter_offset = c_out * filter_height * filter_width +
+                                    filter_h * filter_width + filter_w;
+          const int output_grad_offset =
+              ((batch * output_channels + c_out) * output_height + h_out) *
+                  output_width +
+              w_out;
+          value +=
+              output_grad_data[output_grad_offset] * filter_data[filter_offset];
+        }
+      }
+    }
+    input_grad_data[index] += value;
+  }
+}
+
+// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
+template <typename T>
+__global__ void KernelDepthwiseConvFilterGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const input_data, const int num, const int output_channels,
+    const int output_height, const int output_width, const int input_channels,
+    const int input_height, const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const filter_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int w_out = index % output_width;
+    const int h_out = (index / output_width) % output_height;
+    const int c_out = (index / output_width / output_height) % output_channels;
+    const int batch = (index / output_width / output_height / output_channels);
+    const int c_in = c_out / filter_multiplier;
+    const int h_in_start = -padding_height + h_out * stride_height;
+    const int w_in_start = -padding_width + w_out * stride_width;
+    const int h_in_end =
+        -padding_height + h_out * stride_height + filter_height;
+    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
+      }
+    }
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+template <class T>
+class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = filter.dims()[2];
+    const int ksize_width = filter.dims()[3];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* filter_data = filter.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        output_channels / input_channels, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        output_data);
+  }
+};
+
+template <typename T>
+class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output_grad.dims()[1];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int ksize_height = filter.dims()[2];
+    const int ksize_width = filter.dims()[3];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* filter_data = filter.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        output_channels / input_channels, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+template <typename T>
+class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* filter_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output_grad.dims()[1];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int ksize_height = filter_grad->dims()[2];
+    const int ksize_width = filter_grad->dims()[3];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, input_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        output_channels / input_channels, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        filter_grad_data);
+  }
+};
+
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
+
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                             float>;
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                             double>;
+
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              float>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/operators/math/depthwise_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..4708920bb42db90d84fda0c6a1039991cb79e80d
--- /dev/null
+++ b/paddle/operators/math/depthwise_conv.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * \brief Compute the depthwise convolution which include
+ * forward process and backpropagation process
+ */
+template <typename DeviceContext, typename T>
+class DepthwiseConvFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvInputGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvFilterGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* filter_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index dcf4b85e1aadf88e4b1ca70ac7e8b5416fc58cd8..ce0a5f6cff873166e3308a625978ecefaed2aa29 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -331,6 +331,12 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
 
+template struct RowwiseSum<platform::CPUDeviceContext, float>;
+template struct RowwiseSum<platform::CPUDeviceContext, double>;
+
+template struct RowwiseMean<platform::CPUDeviceContext, float>;
+template struct RowwiseMean<platform::CPUDeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index d47a7f818ded61baf31e46ea3b8ae3101324111f..c0a107470a4629506fc06dabc78a4a4716be6649 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -325,6 +325,31 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
       vector->data<double>());
 }
 
+template struct RowwiseSum<platform::CUDADeviceContext, float>;
+// template struct RowwiseSum<platform::CUDADeviceContext, double>;
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
+// and only failed for this case. So reimplemented it.
+template <>
+void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
+  framework::Tensor one;
+  one.mutable_data<double>({size}, context.GetPlace());
+  SetConstant<platform::CUDADeviceContext, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::CUDADeviceContext, double>(
+      context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
+      1.0, one.data<double>(), input.data<double>(), 0.0,
+      vector->data<double>());
+}
+
+template struct RowwiseMean<platform::CUDADeviceContext, float>;
+template struct RowwiseMean<platform::CUDADeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491..cb14d1e57468564710640773fdabd41896c178e0 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -128,6 +128,18 @@ struct ColwiseSum {
                   framework::Tensor* vec);
 };
 
+template <typename DeviceContext, typename T>
+struct RowwiseSum {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
+template <typename DeviceContext, typename T>
+struct RowwiseMean {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
index de591626df28e2bc3391b609f909612411398247..af4127788af0aaeb99199f7d6e2138a449b9fe51 100644
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -87,6 +87,88 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
+                                               const framework::Tensor& input,
+                                               framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseMean<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), height);
+    auto inv_size = 1.0 / size;
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum * inv_size;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              framework::Tensor* out) {
+  auto in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
+}
+// TODO(zcd): Following ColwiseSum format, need to confirm.
+// Specialize for CPU, since Eigen implement a general reduce. However,
+// rowwise-sum can be easily implemented. General reduce has a huge overhead in
+// CPU
+template <typename T>
+class RowwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      T sum = 0;
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        sum += in_buf[i * size + j];
+      }
+      out_buf[i] = sum;
+    }
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index 0ee456f9bc61436bd0f2f8ef20dd1654e7e56d56..acdd87cb3550bc5f3891aed6fefd4301a3395f9f 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, input2.height());
     output->set_height(in1_height);
 
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = input2.rows();
     std::vector<int64_t> out_rows;
     out_rows.reserve(in1_rows.size() + in2_rows.size());
@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
 
     auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
     PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), out_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     auto in1_height = input1.height();
     PADDLE_ENFORCE_EQ(in1_height, input2->height());
 
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = *(input2->mutable_rows());
 
     auto& in1_value = input1.value();
@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
 
     auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
     PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), in2_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
   }
 };
 
@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
-    auto input_rows = input.rows();
+    framework::Vector<int64_t> input_rows(input.rows());
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
 
@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
     MergeAddKernel<
         T, 256><<<grid1, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(input_data, input.rows().data(), out_data,
-                                   out.rows().data(), out.rows().size(),
-                                   input_width);
+                      .stream()>>>(input_data, input_rows.cuda_data(), out_data,
+                                   out.mutable_rows()->cuda_data(),
+                                   out.rows().size(), input_width);
     return out;
   }
 };
@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
     dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
     dim3 grid(1, in1_rows.size());
     UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
-        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
-                                              in2_data, in1_row_numel);
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
+                                              op, in2_data, in1_row_numel);
   }
 };
 }  // namespace scatter
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
index e459a42ca251a9fc79f745f48a118ce898a0f77e..17abce1c2f809f75edb2c5dc46709094c2ce10c3 100644
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@@ -23,8 +23,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
-                  framework::Tensor& dst, bool is_src_index) {
+                  const framework::Tensor& src,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
index 452ae8951000872b706f7e4227a62dbf98109e7e..f27631271a42b4d64abef00d7f119b85e32edda4 100644
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@@ -42,8 +42,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
-                  framework::Tensor& dst, bool is_src_index) {
+                  const framework::Tensor& src,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.cuda_data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
index a5c43a2c7d4d729c35a20a27de2a23141e6019bc..6db0427b4174a09dd254d771e8d3d215cc6571a9 100644
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  const size_t* index, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
                   bool is_src_index);
 };
 
@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-      to_batch(context, lod_tensor, lods[1].data(), batch, true);
+      to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
     batch.set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
+    to_batch(context, lod_tensor, batch_lods[1], batch, true);
   }
 };
 
@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
                       static_cast<size_t>(lod_tensor.dims()[0]));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
-    size_t* index = in_lod[1].data();
-    to_seq(context, batch, index, lod_tensor, false);
+    to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
 };
 
diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu
index a38df26f59569c4fd54a1ba5691b2cd5f3245344..65c9cfe4a0ec14d220ad237baa71703a783ed0fa 100644
--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/operators/math/sequence_padding.cu
@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     T* padding_data = padding.data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          padding_data, const_cast<T*>(seq_data),
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          padding_data, const_cast<T*>(seq_data),
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     }
   }
 };
@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     T* seq_data = seq.data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          const_cast<T*>(padding_data), seq_data,
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          const_cast<T*>(padding_data), seq_data,
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     }
   }
 };
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
index 4c9e6b375ce7251747b9cd443d86cca0858c84ef..f66534a6812a66c737445ea96914a393077d7d65 100644
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(num_seq, 1);
     auto stream = context.stream();
     KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.data(), out_data, max_index, num_seq, dim);
+        in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
   }
 };
 
diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu
index ceaabd8e0fd81c927fbd4333c0aa7954b8da8513..fd4e28f6113729cd1fa9dc179bd9b601d29b8a7f 100644
--- a/paddle/operators/math/sequence_scale.cu
+++ b/paddle/operators/math/sequence_scale.cu
@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
 
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].data(), scales, seq_width);
+        seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
   }
 };
 
diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..051cc24706d69ec4f38524af1dd510bf079c74c7
--- /dev/null
+++ b/paddle/operators/mine_hard_examples_op.cc
@@ -0,0 +1,330 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+enum MiningType { kNone = 0, kMaxNegative, kHardExample };
+
+template <typename T>
+bool SortScoreDescend(const std::pair<float, T>& pair1,
+                      const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dist,
+                             const float neg_dist_threshold) {
+  if (mining_type == MiningType::kMaxNegative) {
+    return match_idx == -1 && match_dist < neg_dist_threshold;
+  } else if (mining_type == MiningType::kHardExample) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline MiningType GetMiningType(std::string str) {
+  if (str == "max_negative") {
+    return MiningType::kMaxNegative;
+  } else if (str == "hard_example") {
+    return MiningType::kHardExample;
+  } else {
+    return MiningType::kNone;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MineHardExamplesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
+    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
+    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
+    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
+    T neg_dist_threshold =
+        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
+    int sample_size = ctx.Attr<int>("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr<std::string>("mining_type"));
+
+    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
+    auto out_match_indices =
+        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
+
+    framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
+
+    int batch_size = in_matched_indices->dims()[0];
+    int prior_num = in_matched_indices->dims()[1];
+
+    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
+
+    auto match_indices_et =
+        framework::EigenMatrix<int>::From(*out_match_indices);
+
+    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
+
+    const T* cls_loss = in_cls_loss->data<T>();
+    const T* loc_loss = nullptr;
+    if (in_loc_loss) {
+      loc_loss = in_loc_loss->data<T>();
+    }
+
+    std::vector<std::vector<int>> all_neg_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int n = 0; n < batch_size; ++n) {
+      std::vector<std::pair<T, size_t>> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
+                             neg_dist_threshold)) {
+          T loss = cls_loss[n * prior_num + m];
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+
+      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
+      std::set<int> sel_indices;
+      std::vector<int> neg_indices;
+      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
+                     std::inserter(sel_indices, sel_indices.begin()),
+                     [](std::pair<T, size_t>& l) -> int {
+                       return static_cast<int>(l.second);
+                     });
+
+      if (mining_type == MiningType::kHardExample) {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) > -1) {
+            if (sel_indices.find(m) == sel_indices.end()) {
+              match_indices_et(n, m) = -1;
+            }
+          } else {
+            if (sel_indices.find(m) != sel_indices.end()) {
+              neg_indices.push_back(m);
+            }
+          }
+        }
+      } else {
+        neg_indices.resize(sel_indices.size());
+        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
+      }
+
+      all_neg_indices.push_back(neg_indices);
+      batch_starts.push_back(batch_starts.back() + neg_indices.size());
+    }
+
+    framework::LoD out_neg_indices_lod;
+    out_neg_indices_lod.emplace_back(batch_starts);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indices->mutable_data<int>(
+        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
+        ctx.GetPlace());
+
+    for (auto neg_indices : all_neg_indices) {
+      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
+      neg_offset += neg_indices.size();
+    }
+    out_neg_indices->set_lod(out_neg_indices_lod);
+    return;
+  }
+};
+
+class MineHardExamplesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
+                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchIndices"),
+        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchDist"),
+        "Input(MatchDist) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegIndices"),
+        "Output(NegIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
+                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
+                   "not be null.");
+
+    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
+    auto idx_dims = ctx->GetInputDim("MatchIndices");
+    auto dis_dims = ctx->GetInputDim("MatchDist");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
+                      "The shape of ClsLoss is [N, Np].");
+    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
+                      "The shape of MatchIndices is [N, Np].");
+    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
+                      "The shape of MatchDist is [N, Np].");
+
+    if (ctx->HasInput("LocLoss")) {
+      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
+      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
+                        "The shape of LocLoss is [N, Np].");
+      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
+                        "Batch size of ClsLoss and LocLoss must be the same.");
+      PADDLE_ENFORCE_EQ(
+          cls_loss_dims[1], loc_loss_dims[1],
+          "Prior box number of ClsLoss and LocLoss must be the same.");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[0], idx_dims[0],
+        "Batch size of ClsLoss and MatchIndices must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchIndices must be the same.");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
+                      "Batch size of ClsLoss and MatchDist must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchDist must be the same.");
+
+    auto mining_type =
+        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
+
+    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
+                      "mining_type must be hard_example or max_negative");
+
+    if (mining_type == MiningType::kMaxNegative) {
+      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
+      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
+      PADDLE_ENFORCE_GT(
+          neg_pos_ratio, 0.0f,
+          "neg_pos_ratio must greater than zero in max_negative mode");
+      PADDLE_ENFORCE_GT(
+          neg_dist_threshold, 0.0f,
+          "neg_dist_threshold must greater than zero in max_negative mode");
+    } else if (mining_type == MiningType::kHardExample) {
+      auto sample_size = ctx->Attrs().Get<int>("sample_size");
+      PADDLE_ENFORCE_GT(
+          sample_size, 0,
+          "sample_size must greater than zero in hard_example mode");
+    }
+
+    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
+        ctx.device_context());
+  }
+};
+
+class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "ClsLoss",
+        "(Tensor, default Tensor<float>), The classification loss with shape "
+        "[N, Np], N is the batch size and Np is the number of prior box.");
+    AddInput("LocLoss",
+             "(Tensor, optional, default Tensor<float>), The localization loss "
+             "with shape [N, Np], N is the batch size and Np is the number of "
+             "prior box.")
+        .AsDispensable();
+    AddInput("MatchIndices",
+             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
+             "the batch size and Np is the number of prior box. "
+             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
+             "instance does not match any entity, otherwise means it is "
+             "matched to row.");
+    AddInput("MatchDist",
+             "(Tensor, default Tensor<float>) Matched indices with shape [N, "
+             "Np], N is the batch size and Np is the number of prior box.");
+    AddAttr<float>("neg_pos_ratio",
+                   "(float) The ratio of the negative box to the positive "
+                   "box. Use only when mining_type is max_negative.")
+        .SetDefault(1.0);
+    AddAttr<float>("neg_dist_threshold",
+                   "(float) The negative overlap upper bound for the unmatched "
+                   "predictions. Use only when mining_type is max_negative.")
+        .SetDefault(0.5);
+    AddAttr<int>("sample_size",
+                 "(float) The max sample size of negative box. Use only when "
+                 "mining_type is hard_example.")
+        .SetDefault(0);
+    AddAttr<std::string>("mining_type",
+                         "(float) The mining algorithm name, the value is "
+                         "hard_example or max_negative.")
+        .SetDefault("max_negative")
+        .InEnum({"hard_example", "max_negative"});
+
+    AddOutput(
+        "NegIndices",
+        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
+        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
+        "and each element is the prior box index. "
+        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
+        "and sample 1's box 0 is selected. The output NegIndices is "
+        "[[1], [0]].");
+
+    AddOutput("UpdatedMatchIndices",
+              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
+              "shape [N, Np]. Only update when mining_type is "
+              "hard_example. The input MatchIndices elements will be update to "
+              "-1 when it is not in the candidate high loss list of negative "
+              "examples.");
+
+    AddComment(R"DOC(
+Mine hard examples Operator.
+This operator implements hard example mining to select a subset of negative box indices.
+For each image, selects the box with highest losses. subject to the condition that the 
+box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
+The selected number is min(sample_size, max_negative_box_number) when mining_type is 
+hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) 
+when mining_type is max_negative, where the max_negative_box_number is the count of 
+MatchIndices elements with value -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
+                             ops::MineHardExamplesOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    mine_hard_examples,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..41b9335fb8fc9ef5f5336bd8c63dc68bb94ff4f6
--- /dev/null
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -0,0 +1,384 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+constexpr int64_t kOutputDim = 6;
+constexpr int64_t kBBoxSize = 4;
+
+class MultiClassNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
+                   "Input(BBoxes) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scores"),
+                   "Input(Scores) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MultiClassNMS should not be null.");
+
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+
+    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
+                      "The rank of Input(BBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
+                      "The rank of Input(Scores) must be 3.");
+    PADDLE_ENFORCE_EQ(box_dims[1], 4,
+                      "The 2nd dimension of Input(BBoxes) must be 4, "
+                      "represents the layout of coordinate "
+                      "[xmin, ymin, xmax, ymax]");
+    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
+                      "The 1st dimensiong of Input(BBoxes) must be equal to "
+                      "3rd dimension of Input(Scores), which represents the "
+                      "predicted bboxes.");
+
+    // Here the box_dims[0] is not the real dimension of output.
+    // It will be rewritten in the computing kernel.
+    ctx->SetOutputDim("Out", {box_dims[0], 6});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("Scores")->type()),
+        ctx.device_context());
+  }
+};
+
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <class T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+template <class T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If coordinate values are is invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
+    return static_cast<T>(0.);
+  } else {
+    const T w = box[2] - box[0];
+    const T h = box[3] - box[1];
+    if (normalized) {
+      return w * h;
+    } else {
+      // If coordinate values are not within range [0, 1].
+      return (w + 1) * (h + 1);
+    }
+  }
+}
+
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+                               const bool normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return static_cast<T>(0.);
+  } else {
+    const T inter_xmin = std::max(box1[0], box2[0]);
+    const T inter_ymin = std::max(box1[1], box2[1]);
+    const T inter_xmax = std::min(box1[2], box2[2]);
+    const T inter_ymax = std::min(box1[3], box2[3]);
+    const T inter_w = inter_xmax - inter_xmin;
+    const T inter_h = inter_ymax - inter_ymin;
+    const T inter_area = inter_w * inter_h;
+    const T bbox1_area = BBoxArea<T>(box1, normalized);
+    const T bbox2_area = BBoxArea<T>(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
+template <typename T>
+class MultiClassNMSKernel : public framework::OpKernel<T> {
+ public:
+  void NMSFast(const Tensor& bbox, const Tensor& scores,
+               const T score_threshold, const T nms_threshold, const T eta,
+               const int64_t top_k, std::vector<int>* selected_indices) const {
+    // The total boxes for each instance.
+    int64_t num_boxes = bbox.dims()[0];
+    // 4: [xmin ymin xmax ymax]
+    int64_t box_size = bbox.dims()[1];
+
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    const T* bbox_data = bbox.data<T>();
+
+    while (sorted_indices.size() != 0) {
+      const int idx = sorted_indices.front().second;
+      bool keep = true;
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
+        if (keep) {
+          const int kept_idx = (*selected_indices)[k];
+          T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                        bbox_data + kept_idx * box_size, true);
+          keep = overlap <= adaptive_threshold;
+        } else {
+          break;
+        }
+      }
+      if (keep) {
+        selected_indices->push_back(idx);
+      }
+      sorted_indices.erase(sorted_indices.begin());
+      if (keep && eta < 1 && adaptive_threshold > 0.5) {
+        adaptive_threshold *= eta;
+      }
+    }
+  }
+
+  void MultiClassNMS(const framework::ExecutionContext& ctx,
+                     const Tensor& scores, const Tensor& bboxes,
+                     std::map<int, std::vector<int>>& indices,
+                     int& num_nmsed_out) const {
+    int64_t background_label = ctx.Attr<int>("background_label");
+    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
+    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
+    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
+
+    int64_t class_num = scores.dims()[0];
+    int64_t predict_dim = scores.dims()[1];
+    int num_det = 0;
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      Tensor score = scores.Slice(c, c + 1);
+      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
+              &(indices[c]));
+      num_det += indices[c].size();
+    }
+
+    num_nmsed_out = num_det;
+    const T* scores_data = scores.data<T>();
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+      for (const auto& it : indices) {
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& label_indices = it.second;
+        for (size_t j = 0; j < label_indices.size(); ++j) {
+          int idx = label_indices[j];
+          PADDLE_ENFORCE_LT(idx, predict_dim);
+          score_index_pairs.push_back(
+              std::make_pair(sdata[idx], std::make_pair(label, idx)));
+        }
+      }
+      // Keep top k results per image.
+      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                       SortScorePairDescend<std::pair<int, int>>);
+      score_index_pairs.resize(keep_top_k);
+
+      // Store the new indices.
+      std::map<int, std::vector<int>> new_indices;
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+        int label = score_index_pairs[j].second.first;
+        int idx = score_index_pairs[j].second.second;
+        new_indices[label].push_back(idx);
+      }
+      new_indices.swap(indices);
+      num_nmsed_out = keep_top_k;
+    }
+  }
+
+  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
+                        std::map<int, std::vector<int>>& selected_indices,
+                        Tensor* outs) const {
+    int predict_dim = scores.dims()[1];
+    auto* scores_data = scores.data<T>();
+    auto* bboxes_data = bboxes.data<T>();
+    auto* odata = outs->data<T>();
+
+    int count = 0;
+    for (const auto& it : selected_indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      const std::vector<int>& indices = it.second;
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int idx = indices[j];
+        const T* bdata = bboxes_data + idx * kBBoxSize;
+        odata[count * kOutputDim] = label;           // label
+        odata[count * kOutputDim + 1] = sdata[idx];  // score
+        // xmin, ymin, xmax, ymax
+        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+        count++;
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes = ctx.Input<Tensor>("BBoxes");
+    auto* scores = ctx.Input<Tensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+
+    auto score_dims = scores->dims();
+
+    int64_t batch_size = score_dims[0];
+    int64_t class_num = score_dims[1];
+    int64_t predict_dim = score_dims[2];
+
+    std::vector<std::map<int, std::vector<int>>> all_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int64_t i = 0; i < batch_size; ++i) {
+      Tensor ins_score = scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+      std::map<int, std::vector<int>> indices;
+      int num_nmsed_out = 0;
+      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
+      all_indices.push_back(indices);
+      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    }
+
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
+      od[0] = -1;
+    } else {
+      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
+      for (int64_t i = 0; i < batch_size; ++i) {
+        Tensor ins_score = scores->Slice(i, i + 1);
+        ins_score.Resize({class_num, predict_dim});
+        int64_t s = batch_starts[i];
+        int64_t e = batch_starts[i + 1];
+        if (e > s) {
+          Tensor out = outs->Slice(s, e);
+          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
+        }
+      }
+    }
+
+    framework::LoD lod;
+    lod.emplace_back(batch_starts);
+
+    outs->set_lod(lod);
+  }
+};
+
+class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("BBoxes",
+             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
+             "predicted locations of M bounding bboxes. Each bounding box "
+             "has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax].");
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 1st dimension of BBoxes. ");
+    AddAttr<int>(
+        "background_label",
+        "(int64_t, defalut: 0) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(0);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score. If not provided, consider all boxes.");
+    AddAttr<int>("nms_top_k",
+                 "(int64_t) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences aftern the filtering detections based on "
+                 "score_threshold");
+    AddAttr<float>("nms_threshold",
+                   "(float, defalut: 0.3) "
+                   "The threshold to be used in NMS.")
+        .SetDefault(0.3);
+    AddAttr<float>("nms_eta",
+                   "(float) "
+                   "The parameter for adaptive NMS.")
+        .SetDefault(1.0);
+    AddAttr<int>("keep_top_k",
+                 "(int64_t) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
+              "number of detections in this mini-batch. For each instance, "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddComment(R"DOC(
+This operator is to do multi-class non maximum suppression (NMS) on a batched
+of boxes and scores.
+
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes that have high scores larger than score_threshold, if providing this
+threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+is larger than -1. Then this operator pruns away boxes that have high IOU
+(intersection over union) overlap with already selected boxes by adaptive
+threshold NMS based on parameters of nms_threshold and nms_eta.
+
+Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
+
+This operator support multi-class and batched inputs. It applying NMS
+independently for each class. The outputs is a 2-D LoDTenosr, for each
+image, the offsets in first dimension of LoDTensor are called LoD, the number
+of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
+means there is no detected bbox for this image. If there is no detected boxes
+for all images, all the elements in LoD are 0, and the Out only contains one
+value which is -1.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
+                  ops::MultiClassNMSOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 78263da2fbf843f6a5af2ba95aa0b219a7523b52..d275fa5cbbfbf4a949d7bb16c3acc598543ba000 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -119,7 +119,13 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
 REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
 REGISTER_OP_CPU_KERNEL(
     multiplex,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 4372dc2c65ec7c0f28e46cd070ea471701ce8304..546e6e7a24d3653e9904706eac51c1b833f51463 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -90,7 +90,13 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
     multiplex,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
index 6546096069d4c3fbc4908a16c2dba2ac6d7e6421..827a62534778e48c8d4f03d2634056b7d1392ae8 100644
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -241,7 +241,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
 // ncclBcastOp with desc
 TEST_F(NCCLTester, ncclBcastOp) {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
-  const int kRoot = 5;
+  const int kRoot = 0;
   op2->SetType("ncclBcast");
   op2->SetInput("X", {"st"});
   op2->SetInput("Communicator", {"comm"});
@@ -287,6 +287,9 @@ TEST_F(NCCLTester, ncclBcastOp) {
 }
 
 int main(int argc, char **argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   const int dev_count = p::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/operators/one_hot_op.cc b/paddle/operators/one_hot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e78b7468de4ea5f29378c2dc5905fdd36fb0ae2f
--- /dev/null
+++ b/paddle/operators/one_hot_op.cc
@@ -0,0 +1,95 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/one_hot_op.h"
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace operators {
+
+class OneHotOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of OneHotOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of OneHotOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) should be at least 2.");
+    PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
+                      "Last dimension of Input(X) should be 1.");
+
+    int depth = ctx->Attrs().Get<int>("depth");
+
+    PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth);
+
+    framework::DDim out_dims(x_dims);
+    out_dims[out_dims.size() - 1] = depth;
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /* --> */ "Out");
+  }
+};
+
+class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
+             "The last dimension of X should be 1. Each value of X is an index "
+             "to indicate the position.");
+    AddOutput("Out",
+              "(Tensor, Tensor<float>) Output tensor with same rank as X. "
+              "The tensor consists of one-hot representations of values in X.");
+    AddAttr<int>("depth",
+                 "A positive integer to specify the length of one-hot vector.");
+    AddAttr<int>("dtype",
+                 "An integer to specify the data type of one-hot "
+                 "vector. The default value is FP32.")
+        .SetDefault(paddle::framework::proto::DataType::FP32);
+    AddComment(R"DOC(
+One Hot Operator. This operator creates the one-hot representations for input
+index values. The following example will help to explain the function of this
+operator:
+
+X is a LoDTensor:
+  X.lod = [[0, 1, 4]]
+  X.shape = [4, 1]
+  X.data = [[1], [1], [3], [0]]
+
+set depth = 4
+
+Out is a LoDTensor:
+  Out.lod = [[0, 1, 4]]
+  Out.shape = [4, 4]
+  Out.data = [[0., 1., 0., 0.],
+              [0., 1., 0., 0.],
+              [0., 0., 0., 1.],
+              [1., 0., 0., 0.]]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    one_hot, ops::OneHotKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::OneHotKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/one_hot_op.cu b/paddle/operators/one_hot_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..16f6d9433eabd7be157ed57362a0d55d86c6ee92
--- /dev/null
+++ b/paddle/operators/one_hot_op.cu
@@ -0,0 +1,80 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/one_hot_op.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename InT, typename OutT>
+__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
+                                 const int64_t numel, const int depth) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < numel) {
+    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpCUDAFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  const DeviceContext& ctx_;
+  int depth_;
+
+  OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                      int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    auto stream = ctx_.stream();
+    math::set_constant(ctx_, out_, 0.0);
+
+    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                           PADDLE_CUDA_NUM_THREADS,
+                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        p_in_data, p_out_data, numel, depth_);
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpCUDAFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    one_hot, ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/one_hot_op.h b/paddle/operators/one_hot_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..12031ede2c3cd042a3d25003b714652b4d0d4453
--- /dev/null
+++ b/paddle/operators/one_hot_op.h
@@ -0,0 +1,68 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  int depth_;
+  const DeviceContext& ctx_;
+
+  OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                  int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    math::set_constant(ctx_, out_, 0.0);
+
+    for (int i = 0; i < numel; ++i) {
+      PADDLE_ENFORCE_GE(p_in_data[i], 0,
+                        "Illegal index value, should be at least 0.");
+      PADDLE_ENFORCE_LT(p_in_data[i], depth_,
+                        "Illegal index value, should be less than depth (%d).",
+                        depth_);
+      *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
+    }
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index 09e808902f8fe3a7a07153d3432866c18e81dc7c..89045923f9ff2f33bc112b199c493047440e15c4 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/framework/executor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/threadpool.h"
+#include "paddle/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -39,8 +40,10 @@ static void SplitTensorAndMoveTensorToScopes(
     const std::vector<std::string> &names) {
   size_t num_sub_scopes = 0;
   for (auto &argu : names) {
-    auto *var = scope.FindVar(argu);
-    const auto &tensor = var->Get<LoDTensor>();
+    const auto &tensor =
+        detail::Ref(scope.FindVar(argu),
+                    "Cannot find variable %s in the parent scope", argu)
+            .Get<LoDTensor>();
     auto lod_tensors = tensor.SplitLoDTensor(places);
 
     for (auto &lod : lod_tensors) {
@@ -60,7 +63,9 @@ static void SplitTensorAndMoveTensorToScopes(
     }
 
     for (size_t i = 0; i < lod_tensors.size(); ++i) {
-      *(*sub_scopes)[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i];
+      *detail::Ref(sub_scopes->at(i)->Var(argu),
+                   "Cannot find variable in the sub-scope", argu)
+           .GetMutable<LoDTensor>() = lod_tensors[i];
     }
   }
 }
@@ -71,18 +76,25 @@ inline void CopyOrShare(const framework::Variable &src,
   if (src.IsType<LoDTensor>()) {
     if (src.Get<LoDTensor>().place() == dst_place) {
       dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+      framework::LoD lod(src.Get<LoDTensor>().lod());
+      lod.CopyToPeer(dst_place);
+      dst->GetMutable<LoDTensor>()->set_lod(lod);
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
     auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_rows(src_sr.rows());
     dst_sr->set_height(src_sr.height());
     if (src_sr.value().place() == dst_place) {
       dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+      dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+      framework::Vector<int64_t> lod(src_sr.rows());
+      lod.CopyToPeer(dst_place);
+      dst_sr->set_rows(lod);
     }
   } else {
     PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -140,6 +152,9 @@ class ParallelDoOp : public framework::OperatorBase {
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
         framework::Copy(src, place, dst);
+        framework::LoD lod(src.lod());
+        lod.CopyToPeer(place);
+        dst->set_lod(lod);
       }
     }
     WaitOnPlaces(places);
@@ -243,17 +258,19 @@ class ParallelDoGradOp : public framework::OperatorBase {
                       const std::vector<framework::Scope *> &sub_scopes,
                       const platform::PlaceList &places) const {
     for (auto &s : Outputs(framework::GradVarName(kParameters))) {
+      VLOG(3) << "Accumulating " << s;
+      if (s == framework::kEmptyVarName) continue;
       std::string tmp_name;
       auto *tmp = sub_scopes[0]->Var(&tmp_name);
 
       for (size_t i = 1; i < sub_scopes.size(); ++i) {
         CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlace(places[0]);
+        WaitOnPlaces(places);
 
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
-        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
+        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
       }
@@ -287,6 +304,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
                         this->InputGrad(input_param, false));
       }
     }
+    auto *g_block = this->grad_block_[0];
+
+    // All variable name that needed by gradient operators
+    std::unordered_set<std::string> all_inputs_in_grad_blocks;
+
+    for (size_t i = 0; i < g_block->OpSize(); ++i) {
+      auto *op = g_block->Op(i);
+      for (auto &var_name : op->InputArgumentNames()) {
+        all_inputs_in_grad_blocks.insert(var_name);
+      }
+    }
 
     for (auto &output_param : this->OutputNames()) {
       if (output_param == kParallelScopes) {
@@ -295,8 +323,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
                        this->Output(output_param));
       } else {
         grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->OutputGrad(output_param));
+        std::vector<std::string> og_names;
+        for (auto &og_name : this->OutputGrad(output_param)) {
+          if (all_inputs_in_grad_blocks.count(og_name) != 0) {
+            // there are some gradient operators who need the OG. So make this
+            // OG as an input of parallel.do
+            og_names.push_back(og_name);
+          }
+          // else, there is no operator who need the OG. Do not use this OG as
+          // an input
+        }
+        grad->SetInput(framework::GradVarName(output_param), og_names);
       }
     }
     grad->SetAttrMap(this->Attrs());
@@ -309,16 +346,9 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> input{kParameters, kInputs};
-    std::vector<std::string> output{kOutputs};
-
     PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
     PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-
-    for (auto &s : output) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-    }
+    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
 
     ctx->SetOutputsDim(framework::GradVarName(kParameters),
                        ctx->GetInputsDim(kParameters));
@@ -335,10 +365,14 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
       ctx->SetDims({ig_name}, {i_dims[i]});
     }
 
-    if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-      ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                         ctx->GetInputsDim(kParameters));
+    auto p_dims = ctx->GetInputsDim(kParameters);
+    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
+    for (size_t i = 0; i < pg_names.size(); ++i) {
+      auto &pg_name = pg_names[i];
+      if (pg_name == framework::kEmptyVarName) {
+        continue;
+      }
+      ctx->SetDims({pg_name}, {p_dims[i]});
     }
   }
 };
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index c3d82ecbdeb412f0234fcddc27361d79b58c7122..d6ba5e298a4939e31fde71bf5bf8484640a7ceaf 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -139,10 +139,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
-      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
-      temp.device(
-          *context.template device_context<DeviceContext>().eigen_device()) =
-          temp.constant(static_cast<T>(0));
+      paddle::operators::math::SetConstant<DeviceContext, T> set_constant;
+      set_constant(dev_ctx, in_x_grad, 0.0);
 
       switch (ksize.size()) {
         case 2: {
diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
index 105ff4ac3e3ba889aad880f4204af15829c6da47..1dc4b288559d0294e5f58cc923ffc78b80604af9 100644
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
     bool flip = ctx->Attrs().Get<bool>("flip");
 
-    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                      "Size of min_sizes must be at least 1.");
-    for (size_t i = 0; i < min_sizes.size(); ++i) {
-      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
-    }
-
     std::vector<float> aspect_ratios_vec;
     ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
 
@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
-    for (size_t i = 0; i < variances.size(); ++i) {
-      PADDLE_ENFORCE_GT(variances[i], 0.0,
-                        "variance[%d] must be greater than 0.", i);
-    }
-
-    const float step_h = ctx->Attrs().Get<float>("step_h");
-    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
-    const float step_w = ctx->Attrs().Get<float>("step_w");
-    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
-
     std::vector<int64_t> dim_vec(4);
     dim_vec[0] = input_dims[2];
     dim_vec[1] = input_dims[3];
@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
               "H is the height of input, W is the width of input, num_priors "
               "is the box count of each position.");
-    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
-                              "List of min sizes of generated prior boxes.");
-    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
-                              "List of max sizes of generated prior boxes.");
+
+    AddAttr<std::vector<int>>("min_sizes",
+                              "(vector<int>) List of min sizes "
+                              "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<int>& min_sizes) {
+          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                            "Size of min_sizes must be at least 1.");
+          for (size_t i = 0; i < min_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(min_sizes[i], 0,
+                              "min_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<int>>(
+        "max_sizes",
+        "(vector<int>) List of max sizes of generated prior boxes.");
     AddAttr<std::vector<float>>(
-        "aspect_ratios", "(vector<float>) ",
-        "List of aspect ratios of generated prior boxes.");
+        "aspect_ratios",
+        "(vector<float>) List of aspect ratios of generated prior boxes.");
+
     AddAttr<std::vector<float>>(
-        "variances", "(vector<float>) ",
-        "List of variances to be encoded in prior boxes.");
-    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        "variances",
+        "(vector<float>) List of variances to be encoded in prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
         .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
         .SetDefault(true);
+
     AddAttr<float>("step_w",
                    "Prior boxes step across width, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+        });
     AddAttr<float>("step_h",
                    "Prior boxes step across height, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
     AddAttr<float>("offset",
                    "(float) "
                    "Prior boxes center offset.")
diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
index e0a663ace8f38c2d08fd4714c1247d3313ffae3e..6b221cb74ebb306b99533f49c824ed6c60144ff2 100644
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                std::vector<float>& output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
   output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.);
+  output_aspect_ratior.push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
     if (!already_exist) {
       output_aspect_ratior.push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1. / ar);
+        output_aspect_ratior.push_back(1.0f / ar);
       }
     }
   }
@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
 
 template <typename T>
 struct ClipFunctor {
-  HOSTDEVICE T operator()(T in) const {
+  HOSTDEVICE inline T operator()(T in) const {
     return std::min<T>(std::max<T>(in, 0.), 1.);
   }
 };
@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
+    T inv_img_width = 1.0 / img_width;
+    T inv_img_height = 1.0 / img_height;
+
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
           // first prior: aspect_ratio = 1, size = min_size
           box_width = box_height = min_size;
           // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
           // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 1) =
+              (center_y - box_height * 0.5) * inv_img_height;
           // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
           // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 3) =
+              (center_y + box_height * 0.5) * inv_img_height;
 
           idx++;
           if (max_sizes.size() > 0) {
@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             // size = sqrt(min_size * max_size)
             box_width = box_height = sqrt(min_size * max_size);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
 
@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             box_width = min_size * sqrt(ar);
             box_height = min_size / sqrt(ar);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
         }
diff --git a/paddle/operators/read_op.cc b/paddle/operators/read_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ae454101f585cf412a306fd3198f99fbdb8324d
--- /dev/null
+++ b/paddle/operators/read_op.cc
@@ -0,0 +1,99 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+
+class ReadInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Reader"),
+                   "The ReadOp must take a reader as input.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "The ReadOp should be assigned with output.");
+    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+    std::vector<std::string> out_names = ctx->Outputs("Out");
+    PADDLE_ENFORCE_EQ(
+        reader_dims.size(), out_names.size(),
+        "The reader's dim number doesn't match the output number.");
+    ctx->SetOutputsDim("Out", reader_dims);
+  }
+};
+
+class ReadInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    std::string reader_name = op_desc.Input("Reader")[0];
+    std::vector<std::string> out_names = op_desc.Output("Out");
+    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+    auto dtypes = reader->GetDataTypes();
+    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+      out.SetType(framework::proto::VarDesc::LOD_TENSOR);
+      out.SetDataType(dtypes[i]);
+    }
+  }
+};
+
+class ReadOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    framework::ReaderHolder* reader =
+        scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
+    if (!reader->HasNext()) {
+      reader->ReInit();
+      PADDLE_ENFORCE(
+          reader->HasNext(),
+          "Reader can not read the next data even it has been re-initialized.");
+    }
+    std::vector<std::string> out_arg_names = Outputs("Out");
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
+    for (size_t i = 0; i < ins.size(); ++i) {
+      auto* out =
+          scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
+      out->ShareDataWith(ins[i]);
+      out->set_lod(ins[i].lod());
+    }
+  }
+};
+
+class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+    AddInput("Reader", "(ReaderHolder) The executed reader.");
+    AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddComment(R"DOC(
+      Read Operator
+
+      Execute a given reader once and output data.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(read, ops::ReadOp, ops::ReadInferShape, ops::ReadOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::ReadInferVarType);
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index 593c35879ae2b3680b93ac5d8443110e61cb99fe..ba71094219f37eb7a3c2df68be986cec7afbf7ab 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -12,179 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdint.h>
-#include <sys/stat.h>
 #include <ostream>
-#include <thread>
 
-#include <unistd.h>
-
-#include "paddle/framework/executor.h"
+#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/proto_desc.h"
-#include "paddle/operators/detail/grpc_server.h"
-#include "paddle/operators/detail/sendrecvop_utils.h"
-#include "paddle/operators/detail/simple_block_queue.h"
-#include "paddle/string/printf.h"
 
-#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+#include <future>
+#include "paddle/operators/detail/grpc_client.h"
 
 namespace paddle {
 namespace operators {
 
-constexpr char kOptimizeBlock[] = "OptimizeBlock";
-
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
-  service->RunSyncUpdate();
-  VLOG(4) << "RunServer thread end";
-}
-
-static void CreateTensorFromMessageType(framework::Variable *var,
-                                        sendrecv::VarType var_type) {
-  if (var_type == sendrecv::VarType::LOD_TENSOR) {
-    var->GetMutable<framework::LoDTensor>();
-  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
-    var->GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW(
-        "VariableMessage type %d is not in "
-        "[LoDTensor, SelectedRows]",
-        var_type);
-  }
-}
-
 class RecvOp : public framework::OperatorBase {
  public:
-  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    if (!rpc_service_) {
-      std::string endpoint = Attr<std::string>("endpoint");
-      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
-      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+  RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    auto outs = Outputs("Out");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < outs.size(); i++) {
+      VLOG(3) << "getting " << outs[i];
+      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
     }
+    PADDLE_ENFORCE(client_.Wait());
   }
 
-  void Stop() override {
-    detail::MessageWithName term_msg;
-    term_msg.first = LISTEN_TERMINATE_MESSAGE;
-    rpc_service_->Push(term_msg);
-    rpc_service_->ShutDown();
-    server_thread_->join();
-  }
-
-  std::string GetGradVarNameForTrainer(const std::string &varname) const {
-    if (grads_counter_.find(varname) == grads_counter_.end()) {
-      grads_counter_[varname] = 0;
-    }
-    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
-  }
-
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    framework::Scope &recv_scope = scope.NewScope();
-
-    // FIXME(Yancey1989): initialize rpc server with laze mode.
-    rpc_service_->SetScope(&recv_scope);
-    rpc_service_->SetDevCtx(&dev_ctx);
-    auto param_list = Attr<std::vector<std::string>>("ParamList");
-    auto grad_list = Attr<std::vector<std::string>>("GradList");
-    auto fan_in = Attr<int>("Fanin");
-    size_t param_count = param_list.size();
-
-    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-    auto *program = block->Program();
-    framework::Executor executor(dev_place);
-
-    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
-    bool exit_flag = false;
-    size_t barrier_size = param_count * fan_in;
-    while (!exit_flag) {
-      // Get from multiple trainers, we don't care about the order in which
-      // the gradients arrives, just add suffix 0~n and merge the gradient.
-      rpc_service_->SetCond(0);
-      for (size_t i = 0; i < barrier_size; ++i) {
-        const detail::MessageWithName &v = rpc_service_->Get();
-        auto grad_var_name = v.first;
-        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
-          LOG(INFO) << "received terminate message and exit";
-          exit_flag = true;
-          break;
-        }
-        auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
-        std::string param_var_name;
-        if (it != grad_list.end()) {
-          param_var_name = param_list[it - grad_list.begin()];
-        } else {
-          LOG(ERROR) << "grad has no paired param:" << grad_var_name;
-        }
-        VLOG(3) << "received grad: " << grad_var_name
-                << " updating param: " << param_var_name;
-        if (fan_in > 1) {
-          grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
-        }
-        auto *var = recv_scope.FindVar(grad_var_name);
-        if (var == nullptr) {
-          LOG(ERROR) << "Can not find server side var: " << grad_var_name;
-          PADDLE_THROW("Can not find server side var");
-        }
-        detail::DeserializeFromMessage(v.second, dev_ctx, var);
-      }
-      if (exit_flag) {
-        break;
-      }
-
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
-                     false /*create_local_scope*/, false /*create_vars*/);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
-      }
-      rpc_service_->SetCond(1);
-      rpc_service_->WaitClientGet(barrier_size);
-      grads_counter_.clear();
-    }  // while(true)
-  }
-
- protected:
-  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
-  std::shared_ptr<std::thread> server_thread_;
-  mutable std::unordered_map<std::string, int> grads_counter_;
+ private:
+  mutable detail::RPCClient client_;
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
+    AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
 
-This operator will recieve tensor from send_op
+This operator can get variables from server side.
 )DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<framework::BlockDesc *>(
-        kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
-    AddAttr<std::vector<std::string>>(
-        "ParamList", "type list of string",
-        "grad->param name mapping to find which parameters to optimize.")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "GradList", "type list of string",
-        "grad->param name mapping to find which parameters to optimize.")
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping")
         .SetDefault({});
-    AddAttr<int>("Fanin", "type int",
-                 "Number of trainers in the current cluster job")
-        .SetDefault(1);
   }
 };
 
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 4a06babeda00f2420df80f81f876a0047a3285ef..84f24a909597915f0eebb6c9cad37510cbe93e7b 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/reduce_op.h"
-#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -38,10 +37,14 @@ class ReduceOp : public framework::OperatorWithKernel {
         dim, x_rank,
         "The dim should be in the range [-rank(input), rank(input)).");
     bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
     if (reduce_all) {
-      ctx->SetOutputDim("Out", {1});
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
     } else {
-      bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
       auto dims_vector = vectorize(x_dims);
       if (keep_dim || x_rank == 1) {
         dims_vector[dim] = 1;
diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu
index 41f2c5b9de91ade15b4010f56377675cfd1b611c..b3825212e1ac41b13a2f4cad2c128da39c5f6e71 100644
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
     auto stream = context.cuda_device_context().stream();
 
     if (future_context <= 32) {
@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
 
     auto &device_ctx = context.cuda_device_context();
     math::SetConstant<platform::CUDADeviceContext, T> zero;
diff --git a/paddle/operators/save_combine_op.cc b/paddle/operators/save_combine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bffa2908bc42d73332f22fa3706d24ab49cd4b38
--- /dev/null
+++ b/paddle/operators/save_combine_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(sidgoyal78): These function are needed by other files (save_op), move
+// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755)) {
+    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+  }
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+class SaveCombineOp : public framework::OperatorBase {
+ public:
+  SaveCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto inp_var_names = Inputs("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = var->Get<framework::LoDTensor>();
+      // Serialize tensor
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+};
+
+class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SaveCombine operator
+
+This operator will serialize and write a list of input LoDTensor variables 
+to a file on disk.
+)DOC");
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if it exists.")
+        .SetDefault(true);
+    AddAttr<std::string>(
+        "file_path",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
+                  ops::SaveCombineOpProtoMaker);
diff --git a/paddle/operators/save_load_combine_op_test.cc b/paddle/operators/save_load_combine_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ddc4a6c55d72e4e444869a1ebcd7662c892317
--- /dev/null
+++ b/paddle/operators/save_load_combine_op_test.cc
@@ -0,0 +1,180 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+USE_NO_KERNEL_OP(save_combine);
+USE_NO_KERNEL_OP(load_combine);
+
+int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                            std::string var_name,
+                            paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope& scope,
+                            paddle::framework::LoD& expect_lod) {
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({x, y});
+  expect_lod.resize(1);
+  for (size_t i = 0; i < lod_info.size(); i++) {
+    expect_lod[0].push_back(lod_info[i]);
+  }
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  return expect;
+}
+
+paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
+    const std::string out_var_name, paddle::framework::Scope& scope) {
+  auto load_var = scope.Var(out_var_name);
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  return target;
+}
+
+int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                                 paddle::framework::Scope& scope,
+                                 paddle::framework::LoD& actual_lod) {
+  int* actual = target->data<int>();
+  actual_lod = target->lod();
+  return actual;
+}
+
+void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
+                 paddle::framework::LoD actual_lod, const int& numel) {
+  for (int64_t i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+
+// Here, we create 4 LoDTensors and use save_combine_op to first save these
+// in a single file. Then, we use load_combine_op to load these sequentially
+TEST(SaveLoadCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
+                                        expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
+                                        expect_lod2);
+
+  std::vector<int> lod3 = {0, 2, 3, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
+                                        scope, expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
+                                        expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+
+  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+
+// Test with original SaveLoadTest
+TEST(SaveLoadTestWithCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("check_t.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  int* actual = target->data<int>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
index 40103d864fb58804b39ca5f3c63e802a430ce886..d829d5da174b73613da9dcfcd308a5b05e12bce9 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
 
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({10, 10});
+  tensor->Resize({3, 10});
   paddle::framework::LoD expect_lod;
   expect_lod.resize(1);
   expect_lod[0].push_back(0);
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 5aa66c20eaf77959089100f8dcee55f2bc83a71a..ee0f268b0e4dfa23bf878d71404d47553183a977 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -37,25 +37,39 @@ class SendOp : public framework::OperatorBase {
     auto ins = Inputs("X");
     auto outs = Outputs("Out");
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    std::vector<std::string> endpoints =
+        Attr<std::vector<std::string>>("endpoints");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
     for (size_t i = 0; i < ins.size(); i++) {
-      VLOG(3) << "sending " << ins[i];
-      client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+      rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
     }
-    PADDLE_ENFORCE(client_.Wait());
+    PADDLE_ENFORCE(rpc_client->Wait());
 
-    for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i];
-      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+    for (auto& ep : endpoints) {
+      VLOG(3) << "batch barrier, ep: " << ep;
+      rpc_client->AsyncSendBatchBarrier(ep);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+
+    if (outs.size() > 0) {
+      for (size_t i = 0; i < outs.size(); i++) {
+        VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      }
+      PADDLE_ENFORCE(rpc_client->Wait());
     }
-
-    PADDLE_ENFORCE(client_.Wait());
   }
-
- private:
-  mutable detail::RPCClient client_;
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -65,11 +79,16 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
     AddOutput("Out", "(Tensor) Output tensor to be received from server")
         .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
 This operator will send tensor to recv_op at the parameter server.
 )DOC");
+    // TODO(typhoonzero): remove this attr generate de-duplicated vector from
+    // epmap when initializing.
     AddAttr<std::vector<std::string>>("endpoints",
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints to send variables to.")
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
index 045a0f5434f339bab345d14881ed05450ce6588d..31527a906d56da54d2571910de627757d708a996 100644
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 
 USE_NO_KERNEL_OP(send);
-USE_NO_KERNEL_OP(recv);
+USE_NO_KERNEL_OP(listen_and_serv);
 USE_OP(sum);
 
 namespace f = paddle::framework;
@@ -33,7 +33,7 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 
 // global for simplicity.
-std::unique_ptr<f::OperatorBase> recv_op;
+std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 
 void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
   p::CPUDeviceContext ctx(place);
@@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) {
     InitTensorsInScope(scope, place);
   }
 
-  // sub program run in recv_op, for simple test we use sum
+  // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   f::BlockDesc *block = program.MutableBlock(0);
   // X for server side tensors, RX for received tensers, must be of same shape.
@@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   attrs.insert({"OptimizeBlock", block});
-  recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
-  recv_op->Run(scope, place);
+  listen_and_serv_op =
+      f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
+  listen_and_serv_op->Run(scope, place);
 }
 
 TEST(SendRecvOp, CPUDense) {
@@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) {
   for (int64_t i = 0; i < target->numel(); ++i) {
     EXPECT_EQ(expected[i] * 2, actual[i]);
   }
-  recv_op->Stop();
+  listen_and_serv_op->Stop();
   server_thread.join();
-  recv_op.reset(nullptr);
+  listen_and_serv_op.reset(nullptr);
 }
 
 TEST(SendRecvOp, CPUSparse) {
@@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) {
     EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
               actual->mutable_data<float>(place)[i]);
   }
-  recv_op->Stop();
+  listen_and_serv_op->Stop();
   server_thread.join();
-  recv_op.reset();
+  listen_and_serv_op.reset();
 }
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
index f1e3b96acd0259de2b3ca1348834bd17e1e174a2..a5311f15f0c607c880a6f12c0bef10b2dd8c8a79 100644
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
-
     // Set LoD for output
-    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
+    std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
     framework::LoD out_lod;
     out_lod.push_back(out_lod0);
     out->set_lod(out_lod);
diff --git a/paddle/operators/sequence_reshape_op.cc b/paddle/operators/sequence_reshape_op.cc
index 57cca13105537d88fe942b850cae10650d3096e2..d89a46a712c9c84a142e1e347219ed171556d761 100644
--- a/paddle/operators/sequence_reshape_op.cc
+++ b/paddle/operators/sequence_reshape_op.cc
@@ -30,8 +30,13 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
     auto x_numel = product(x_dims);
     PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
     int new_dim = ctx->Attrs().Get<int>("new_dim");
-    ctx->SetOutputDim("Out",
-                      {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+    if (ctx->IsRuntime()) {
+      ctx->SetOutputDim("Out",
+                        {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+    } else {
+      // when compiling, the batch size is undetermined, just set to -1
+      ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
+    }
   }
 };
 
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 42f8f8b2f072f9d204dfadcd732926b5c98dc617..29f5aa3542c26c76a1b80da61ec6752019216131 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
 
       auto& in_value = grad->value();
-      auto& in_rows = grad->rows();
+      framework::Vector<int64_t> in_rows(grad->rows());
 
       int64_t in_row_numel = in_value.numel() / in_rows.size();
       PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       dim3 grid(1, in_rows.size());
       SparseSGDFunctorKernel<
           T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.data(), learning_rate->data<T>(), out_data,
+          in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
           in_row_numel);
 
     } else {
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 48201b344de0d3bd2b121a12389876dad095f10d..3d8102c3ae20c8b714cd48b4fc78dc18a0cf89a7 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
         }
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0
+        auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto &rows = in_sel0.rows();
+#ifdef PADDLE_WITH_CUDA
+        std::vector<int64_t> rows_in_cpu;
+        rows_in_cpu.reserve(rows.size());
+        for (auto item : rows) {
+          rows_in_cpu.push_back(item);
+        }
+        in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
+#else
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+#endif
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+
+      auto get_selected_row = [&](size_t i) -> const SelectedRows & {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+
       auto *out = context.Output<SelectedRows>("Out");
       out->mutable_rows()->clear();
       auto *out_value = out->mutable_value();
@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
       // Runtime InferShape
       size_t first_dim = 0;
       for (int i = 0; i < N; i++) {
-        first_dim += in_vars[i]->Get<SelectedRows>().rows().size();
+        auto &sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
       }
-      auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims();
-      auto in_dim_vec = framework::vectorize(in_dim);
-      in_dim_vec[0] = static_cast<int64_t>(first_dim);
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
 
-      out_value->Resize(framework::make_ddim(in_dim_vec));
+      out_value->Resize(framework::make_ddim(in_dim));
       out_value->mutable_data<T>(context.GetPlace());
 
       math::SelectedRowsAddTo<DeviceContext, T> functor;
 
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
-        PADDLE_ENFORCE_EQ(out->height(),
-                          in_vars[i]->Get<SelectedRows>().height());
-        functor(context.template device_context<DeviceContext>(),
-                in_vars[i]->Get<SelectedRows>(), offset, out);
-        offset += in_vars[i]->Get<SelectedRows>().value().numel();
+        auto &sel_row = get_selected_row(i);
+
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(context.template device_context<DeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
       }
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
diff --git a/paddle/operators/target_assign_op.cc b/paddle/operators/target_assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..615ca857ceb45d442b75fffc6662cc2bda19562d
--- /dev/null
+++ b/paddle/operators/target_assign_op.cc
@@ -0,0 +1,202 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // checkout inputs
+    PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"),
+                   "Input(EncodedGTBBox) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
+                   "Input(GTScoreLabel) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
+                   "Input(MatchIndices) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
+                   "Input(NegIndices) of TargetAssignOp should not be null");
+
+    // checkout outputs
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxLabel"),
+        "Output(PredBBoxLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredBBoxWeight"),
+        "Output(PredBBoxWeight) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreLabel"),
+        "Output(PredScoreLabel) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PredScoreWeight"),
+        "Output(PredScoreWeight) of TargetAssignOp should not be null.");
+
+    auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
+    auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
+    auto mi_dims = ctx->GetInputDim("MatchIndices");
+    auto neg_dims = ctx->GetInputDim("NegIndices");
+
+    PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL,
+                      "The rank of Input(EncodedGTBBox) must be 3.");
+    PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
+                      "The rank of Input(GTScoreLabel) must be 2.");
+    PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
+                      "The rank of Input(MatchIndices) must be 2.");
+    PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
+                      "The rank of Input(NegIndices) must be 2.");
+
+    PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0],
+                      "The 1st dimension (means the total number of "
+                      "ground-truth bounding boxes) of Input(EncodedGTBBox) "
+                      "and Input(GTScoreLabel) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1],
+                      "The 2nd dimension (means the number of priod boxes) "
+                      "of Input(EncodedGTBBox) and "
+                      "Input(MatchIndices) must be the same.");
+    PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
+                      "The 3rd dimension of Input(EncodedGTBBox) must be 4.");
+
+    auto n = mi_dims[0];
+    auto np = mi_dims[1];
+    ctx->SetOutputDim("PredBBoxLabel", {n, np, 4});
+    ctx->SetOutputDim("PredBBoxWeight", {n, np, 1});
+    ctx->SetOutputDim("PredScoreLabel", {n, np, 1});
+    ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
+        ctx.device_context());
+  }
+};
+
+class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("EncodedGTBBox",
+             "(LoDTensor), The encoded ground-truth bounding boxes with shape "
+             "[Ng, Np, 4], where Ng is the total number of ground-truth boxes "
+             "in this mini-batch, Np the number of predictions, 4 is the "
+             "number of coordinate in [xmin, ymin, xmax, ymax] layout.");
+    AddInput("GTScoreLabel",
+             "(LoDTensor, default LoDTensor<int>),  The input ground-truth "
+             "labels with shape [Ng, 1], where the Ng is the same as it in "
+             "the input of EncodedGTBBox.");
+    AddInput("MatchIndices",
+             "(Tensor, default Tensor<int>), The input matched indices "
+             "with shape [N, Np], where N is the batch size, Np is the same "
+             "as it in the input of EncodedGTBBox. If MatchIndices[i][j] "
+             "is -1, the j-th prior box is not matched to any ground-truh "
+             "box in i-th instance.");
+    AddInput("NegIndices",
+             "(LoDTensor, default LoDTensor<int>), The input negative example "
+             "indices with shape [Neg, 1], where is the total number of "
+             "negative example indices.");
+    AddAttr<int>("background_label",
+                 "(int, default 0), Label index of background class.")
+        .SetDefault(0);
+    AddOutput("PredBBoxLabel",
+              "(Tensor), The output encoded ground-truth labels "
+              "with shape [N, Np, 4], N is the batch size and Np, 4 is the "
+              "same as they in input of EncodedGTBBox. If MatchIndices[i][j] "
+              "is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth "
+              "box for background_label in i-th instance.");
+    AddOutput("PredBBoxWeight",
+              "(Tensor), The weight for PredBBoxLabel with the shape "
+              "of [N, Np, 1]");
+    AddOutput("PredScoreLabel",
+              "(Tensor, default Tensor<int>), The output score labels for "
+              "each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
+              "is -1, PredScoreLabel[i][j] = background_label.");
+    AddOutput("PredScoreWeight",
+              "(Tensor), The weight for PredScoreLabel with the shape "
+              "of [N, Np, 1]");
+    AddComment(R"DOC(
+This operator is, for given the encoded boxes between prior boxes and
+ground-truth boxes and ground-truth class labels, to assign classification
+and regression targets to each prior box as well as weights to each
+prior box. The weights is used to specify which prior box would not contribute
+to training loss.
+
+For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`,
+`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`.
+Assumed that the row offset for each instance in `EncodedGTBBox` is called lod,
+this operato assigns classification/regression targets by performing the
+following steps:
+
+1. Assigning all outpts based on `MatchIndices`:
+
+If id = MatchIndices[i][j] > 0,
+
+    PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j]
+    PredBBoxWeight[i][j] = 1.
+    PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
+    PredScoreWeight[i][j] = 1.
+
+Otherwise, 
+
+    PredBBoxLabel[j][j] = [0., 0., 0., 0.]
+    PredBBoxWeight[i][j] = 0.
+    PredScoreLabel[i][j] = background_label
+    PredScoreWeight[i][j] = 0.
+
+2. Assigning PredScoreWeight based on `NegIndices`:
+
+Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod,
+for i-th instance and all ids of NegIndices in this instance:
+
+    PredScoreLabel[i][id] = background_label
+    PredScoreWeight[i][id] = 1.0
+
+    )DOC");
+  }
+};
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label, T* out_label_wt) {
+    for (int i = 0; i < num; ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        int id = neg_indices[j];
+        out_label[i * num_prior_box + id] = background_label;
+        out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0);
+      }
+    }
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
+                             ops::TargetAssignOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/target_assign_op.cu b/paddle/operators/target_assign_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fc0a1000a4202adeca3e0d6fbb05e832a79dbaba
--- /dev/null
+++ b/paddle/operators/target_assign_op.cu
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
+                                      const int num, const int num_prior_box,
+                                      const int background_label,
+                                      int* out_label, T* out_label_wt) {
+  int bidx = blockIdx.x;
+  int st = lod[bidx];
+  int ed = lod[bidx + 1];
+
+  int row_start = bidx * num_prior_box;
+  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
+    int id = row_start + neg_indices[i];
+    out_label[id] = background_label;
+    out_label_wt[id] = 1.;
+  }
+}
+
+template <typename T>
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const int* neg_indices, const size_t* lod, const int num,
+                  const int num_prior_box, const int background_label,
+                  int* out_label, T* out_label_wt) {
+    const int block_size = 256;
+    const int grid_size = num;
+    NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>(
+        neg_indices, lod, num, num_prior_box, background_label, out_label,
+        out_label_wt);
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/target_assign_op.h b/paddle/operators/target_assign_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..82fca5724c0bd9fbfb60a98b91944700bfab9cdf
--- /dev/null
+++ b/paddle/operators/target_assign_op.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct TargetAssignFunctor {
+  const T* gt_box_;
+  const int* gt_label_;
+  const int* match_indices_;
+  const size_t* lod_;
+  const int background_label_;
+  const int64_t num_;
+  const int64_t num_prior_box_;
+
+  T* out_box_;
+  T* out_box_wt_;
+  int* out_label_;
+  T* out_label_wt_;
+
+  TargetAssignFunctor(const T* gt_box, const int* gt_label,
+                      const int* match_indices, const size_t* lod,
+                      const int background_label, const int64_t num,
+                      const int64_t np, T* out_box, T* out_box_wt,
+                      int* out_label, T* out_label_wt)
+      : gt_box_(gt_box),
+        gt_label_(gt_label),
+        match_indices_(match_indices),
+        lod_(lod),
+        background_label_(background_label),
+        num_(num),
+        num_prior_box_(np),
+        out_box_(out_box),
+        out_box_wt_(out_box_wt),
+        out_label_(out_label),
+        out_label_wt_(out_label_wt) {}
+
+  HOSTDEVICE void operator()(size_t i) const {
+    int row = i / num_prior_box_;
+    int col = i - row * num_prior_box_;
+
+    size_t row_off = lod_[row];
+    int offset = row * num_prior_box_ + col;
+
+    int id = match_indices_[offset];
+    T* obox = out_box_ + offset * 4;
+    int* olabel = out_label_ + offset;
+    T* obox_wt = out_box_wt_ + offset;
+    T* olabel_wt = out_label_wt_ + offset;
+
+    if (id > -1) {
+      const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4;
+
+      obox[0] = gtbox[0];
+      obox[1] = gtbox[1];
+      obox[2] = gtbox[2];
+      obox[3] = gtbox[3];
+
+      olabel[0] = gt_label_[row_off + id];
+      obox_wt[0] = static_cast<T>(1.);
+      olabel_wt[0] = static_cast<T>(1.);
+    } else {
+      obox[0] = static_cast<T>(0.);
+      obox[1] = static_cast<T>(0.);
+      obox[2] = static_cast<T>(0.);
+      obox[3] = static_cast<T>(0.);
+
+      olabel[0] = background_label_;
+      obox_wt[0] = static_cast<T>(0.);
+      olabel_wt[0] = static_cast<T>(0.);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct NegTargetAssignFunctor {
+  void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int num, const int num_prior_box,
+                  const int background_label, int* out_label,
+                  T* out_label_wt) const;
+};
+
+template <typename DeviceContext, typename T>
+class TargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox");
+    auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
+    auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
+
+    auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
+    auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
+    auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
+    auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
+
+    PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL);
+    PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
+
+    int background_label = ctx.Attr<int>("background_label");
+
+    const T* box_data = enc_gt_box->data<T>();
+    const int* label_data = gt_label->data<int>();
+    const int* match_idx_data = match_indices->data<int>();
+    const int* neg_idx_data = neg_indices->data<int>();
+
+    T* obox_data = out_box->mutable_data<T>(ctx.GetPlace());
+    T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace());
+    int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
+    T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
+
+    int64_t num = match_indices->dims()[0];
+    int64_t num_prior_box = match_indices->dims()[1];
+
+    auto gt_lod = enc_gt_box->lod().back();
+    auto gt_label_lod = gt_label->lod().back();
+    auto neg_lod = neg_indices->lod().back();
+    for (size_t i = 0; i < gt_lod.size(); ++i) {
+      PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
+    }
+
+    size_t* gt_lod_data = gt_lod.data(ctx.GetPlace());
+    size_t* neg_lod_data = neg_lod.data(ctx.GetPlace());
+
+    TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
+                                   gt_lod_data, background_label, num,
+                                   num_prior_box, obox_data, obox_wt_data,
+                                   olabel_data, olabel_wt_data);
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    platform::ForRange<DeviceContext> for_range(device_ctx,
+                                                num * num_prior_box);
+    for_range(functor);
+
+    NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor;
+    neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box,
+                    background_label, olabel_data, olabel_wt_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 2fdd25dbbe68659f8a0a9da13a87148ed259127a..a744ebd61595403ee495a2e2c9e84181422e92ff 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
     auto step_scopes =
         scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
 
+    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
+                   "Condition of while op must in CPU memory.");
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
 
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
         sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
+      dev_ctx.Wait();
+      const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
     }
   }
 };
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index d68caea99719b37816391f9bddcc5cac051025b2..5ce4b3de39d93e1935c6349ae446dec11d2fa986 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -39,11 +39,3 @@ nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
 
 cc_library(profiler SRCS profiler.cc DEPS device_context)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB PLATFORM_HEADERS *.h)
-  file(GLOB PLATFORM_dynload_HEADERS dynload/*.h)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform)
-  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform/dynload)
-  install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details)
-endif()
diff --git a/paddle/platform/assert.h b/paddle/platform/assert.h
index d813b9529ba2c8d5a3f39eadeb82d7569acd5fdd..1f5a8f6a195738ec3b0681aff8565885258a91fb 100644
--- a/paddle/platform/assert.h
+++ b/paddle/platform/assert.h
@@ -1,16 +1,16 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
index 00337a7f051758559a0f8012d8c78dbe8e3457a6..44a4d38f679ddf6c317e52132b6cf3eb2f0a0649 100644
--- a/paddle/platform/call_once.h
+++ b/paddle/platform/call_once.h
@@ -29,20 +29,25 @@ namespace platform {
 */
 template <typename Callable, typename... Args>
 inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
-  bool good = false;
+  bool good = true;
   std::exception ex;
-  std::call_once(flag,
-                 [&](Args&&... args) {
-                   try {
-                     f(args...);
-                     good = true;
-                   } catch (const std::exception& e) {
-                     ex = e;
-                   } catch (...) {
-                     ex = std::runtime_error("excption caught in call_once");
-                   }
-                 },
-                 args...);
+  try {
+    std::call_once(flag,
+                   [&](Args&&... args) {
+                     try {
+                       f(args...);
+                     } catch (const std::exception& e) {
+                       ex = e;
+                       good = false;
+                     } catch (...) {
+                       ex = std::runtime_error("excption caught in call_once");
+                       good = false;
+                     }
+                   },
+                   args...);
+  } catch (std::system_error& x) {
+    throw std::runtime_error("call once failed");
+  }
   if (!good) {
     throw std::exception(ex);
   }
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
index ef6d845874745af1150e4425f8d6be416cc44ece..84f5ac28be319473d045dc554bf2cb3c0e48803a 100644
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -127,6 +127,9 @@ TEST(NCCL, all_reduce) {
 }  // namespace paddle
 
 int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Due to the driver issue on our CI, disable for now
+  return 0;
   dev_count = paddle::platform::GetCUDADeviceCount();
   if (dev_count <= 1) {
     LOG(WARNING)
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 2a8afc940393baaaa939471f50f2d5c63edd6a84..6df087d154cc104955c6399050c9cb2bce8d36e1 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
       };
       break;
     default:
-      sorted_domain = "event end time";
+      sorted_domain = "event first end time";
   }
 
   std::vector<std::vector<EventItem>> events_table;
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index e78673e0baa03496faab13d069b3bd456660bad6..de53fea0dd692167d61fcca552cc834a7916e209 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc const_value.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
+    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
     ${GLOB_OP_LIB})
   if(NOT APPLE AND NOT ANDROID)
     target_link_libraries(paddle_pybind rt)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 371d6119d4ab73e683821d0dc5db5194f44a64ce..0a92e10927caf00be60fdd8107600b4033cf09ea 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -214,11 +214,18 @@ void BindVarDsec(py::module &m) {
            py::return_value_policy::reference)
       .def("set_name", &VarDesc::SetName)
       .def("set_shape", &VarDesc::SetShape)
+      .def("set_shapes", &VarDesc::SetShapes)
       .def("set_dtype", &VarDesc::SetDataType)
-      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("set_dtypes", &VarDesc::SetDataTypes)
+      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
+      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
       .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
       .def("lod_level", &VarDesc::GetLoDLevel)
+      .def("lod_levels", &VarDesc::GetLoDLevels,
+           py::return_value_policy::reference)
       .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &VarDesc::SetLoDLevels)
       .def("type", &VarDesc::GetType)
       .def("set_type", &VarDesc::SetType)
       .def("serialize_to_string", SerializeMessage<VarDesc>)
@@ -233,7 +240,8 @@ void BindVarDsec(py::module &m) {
       .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
-      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
+      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
+      .value("READER", proto::VarDesc::READER);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 82f5b1922c6e97ee73a187e838350a965f1fd269..a880d9bdbc63aacc1f2cdbc0d7da001a59c7b372 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -53,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
   return generators[prefix].fetch_add(1);
 }
 
-bool IsCompileGPU() {
+bool IsCompiledWithCUDA() {
 #ifndef PADDLE_WITH_CUDA
   return false;
 #else
@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
-            new (&instance) LoDTensor(lod);
-#else
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             new (&instance) LoDTensor(new_lod);
-#endif
+            LoD new_lod;
+            new_lod.reserve(lod.size());
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+            new (&instance) LoDTensor(new_lod);
           })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
-             self.set_lod(lod);
-#else
              LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              self.set_lod(new_lod);
-#endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifndef PADDLE_WITH_CUDA
-        return self.lod();
-#else
-           auto lod = self.lod();
-           std::vector<std::vector<size_t>> new_lod;
-           new_lod.reserve(lod.size());
-           std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
-               [](Vector<size_t> item) ->
-                   std::vector<size_t> {
-                 std::vector<size_t> v;
-                 v.reserve(item.size());
-                 std::copy(item.begin(), item.end(), std::back_inserter(v));
-                 return v;
-               });
-           return new_lod;
-#endif
+        auto lod = self.lod();
+        std::vector<std::vector<size_t>> new_lod;
+        new_lod.reserve(lod.size());
+        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+        return new_lod;
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
@@ -424,14 +405,16 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
-      .def("run", &Executor::Run);
+      .def("run",
+           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
+               Executor::Run);
 
   m.def("unique_integer", UniqueIntegerGenerator);
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
   m.def("init_devices", &framework::InitDevices);
 
-  m.def("is_compile_gpu", IsCompileGPU);
+  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index f0620498cfa6775ce2949cc02fa9f6c9529dec2e..65c46745556bc5ea91fdd4e33060f2535422e8e8 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | ------ | -------- | ----------- |
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
-| `WITH_TESTING` | ON | Build unit tests binaries. |
+| `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e70d04d9017e9e36bbd55d6a28889d9ba7fb2a13..ba496db5f834efe767bfe446a46877932faa81a0 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -32,7 +32,7 @@ function cmake_gen() {
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=Release
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
@@ -40,6 +40,7 @@ function cmake_gen() {
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
         -DWITH_SWIG_PY=ON
         -DWITH_C_API=${WITH_C_API:-OFF}
         -DWITH_PYTHON=${WITH_PYTHON:-ON}
@@ -54,7 +55,7 @@ EOF
     # docker environment is fully controlled by this script.
     # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
     cmake .. \
-        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
         ${PYTHON_FLAGS} \
         -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
@@ -62,6 +63,7 @@ EOF
         -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
         -DWITH_C_API=${WITH_C_API:-OFF} \
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
@@ -77,6 +79,7 @@ function run_build() {
     Building in /paddle/build ...
     ============================================
 EOF
+    make clean
     make -j `nproc`
 }
 
@@ -114,7 +117,7 @@ EOF
             -DWITH_STYLE_CHECK=OFF
         make -j `nproc` gen_proto_py
         make -j `nproc` paddle_python
-        make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
         make -j `nproc` print_operators_doc
         paddle/pybind/print_operators_doc > doc/en/html/operators.json
         popd
diff --git a/paddle/scripts/docker/test.sh b/paddle/scripts/docker/test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8180737a8f431d6eb8bab4b2ef7bdcc50cce41f3
--- /dev/null
+++ b/paddle/scripts/docker/test.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+
+# the number of process to run tests
+NUM_PROC=6
+
+# calculate and set the memory usage for each process
+MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
+export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
+
+# get the CUDA device count
+CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
+
+for (( i = 0; i < $NUM_PROC; i++ )); do
+    cuda_list=()
+    for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
+        s=$[i+j]
+        n=$[s%CUDA_DEVICE_COUNT]
+        if [ $j -eq 0 ]; then
+            cuda_list=("$n")
+        else
+            cuda_list="$cuda_list,$n"
+        fi
+    done
+    echo $cuda_list
+    # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
+    # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
+    env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
+done
+wait
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 0db8d33bbcb5278ed0dd5584b5822502b719ede9..4af4ac4f5e43543449ae922d7eb2a5740372f68f 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -9,13 +9,14 @@ cd $TRAVIS_BUILD_DIR/build
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_python
-make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
 linkchecker doc/cn/html/index.html
+linkchecker doc/api/en/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
@@ -54,10 +55,11 @@ function deploy_docs() {
   mkdir -p ${DIR}
   # remove old docs. mv new docs.
   set +e
-  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
   set -e
   cp -r ../doc/cn/html ${DIR}/doc_cn
   cp -r ../doc/en/html ${DIR}/doc
+  cp -r ../doc/api/en/html ${DIR}/api_doc
   git add .
 }
 
diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
index 751776dbb5c00972c0b6893fcfb2e710f3f082d7..1fe7f42ca1c692e4d7034883022852657be8cc20 100644
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,9 +2,3 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
-
-if(NOT WITH_C_API AND WITH_FLUID)
-  file(GLOB STRING_HEADERS *.h)
-  install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string)
-  install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat)
-endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index a7fb50ee4149a3c36077f83383f45f3106e7e0f1..fd8c4a69da897cc39f31f435036e32c41285fb59 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -22,12 +22,15 @@ limitations under the License. */
 int main(int argc, char** argv) {
   std::vector<char*> new_argv;
   std::string gflags_env;
-  new_argv.push_back(argv[0]);
+  for (int i = 0; i < argc; ++i) {
+    new_argv.push_back(argv[i]);
+  }
 #ifdef PADDLE_WITH_CUDA
   new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
+             "warpctc_dir"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 4fdf4090212e31adcccf6b119c937e70d5cbf995..186b91c226accbe1c2d5465d6244b9438eec9979 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -140,8 +140,13 @@ def init_config_environment(
         g_submodel_stack=[],
         g_add_submodel_suffix=False, ):
 
-    for k, v in locals().iteritems():
-        globals()[k] = copy.deepcopy(v)
+    # directly iterate through locals().iteritems() will change
+    # the size of locals() due to introducing k, v into scope
+    # which will break the process in some env
+
+    local_vars = copy.deepcopy(locals())
+    for k, v in local_vars.iteritems():
+        globals()[k] = v
 
 
 # Because type is widely used as a variable name in this code.
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 1f041c74597637a7b74e9690a60b6cd8fdd21cf8..73acbf3e009965f9eaaade77d2fe4cf4f99d4379 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -26,15 +26,17 @@ import initializer
 import layers
 import nets
 import optimizer
+import learning_rate_decay
 import backward
 import regularizer
-from param_attr import ParamAttr
+from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace
 from distribute_transpiler import DistributeTranspiler
 from distribute_transpiler_simple import SimpleDistributeTranspiler
 import clip
 from memory_optimization_transpiler import memory_optimize
+import profiler
 
 Tensor = LoDTensor
 
@@ -44,18 +46,21 @@ __all__ = framework.__all__ + executor.__all__ + [
     'layers',
     'nets',
     'optimizer',
+    'learning_rate_decay',
     'backward',
     'regularizer',
     'LoDTensor',
     'CPUPlace',
     'CUDAPlace',
     'Tensor',
-    'ParamAttr'
+    'ParamAttr',
+    'WeightNormParamAttr',
     'DataFeeder',
     'clip',
     'SimpleDistributeTranspiler',
     'DistributeTranspiler',
     'memory_optimize',
+    'profiler',
 ]
 
 
@@ -87,10 +92,10 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
     read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
     ]
-    if core.is_compile_gpu():
-        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
+    if core.is_compiled_with_cuda():
+        read_env_flags += ['fraction_of_gpu_memory_to_use']
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index ae81d68bafd22db5d9f7ab0f9cc0dcdb204493e1..29243c90e872ca4a7d1ce6f84f6297b865655da1 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
         if _all_in_set_(
                 filter(lambda name: name.find(core.grad_var_suffix()) != -1,
                        op_desc.input_arg_names()), no_grad_set):
-            no_grad_set.union(out_arg_names)
+            no_grad_set.update(out_arg_names)
             return True
         return False
 
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index 3028029e60fde2f481b4348ab1b0a4980ebb2b60..fdbc8524abb7d6687983b026ca8e65e61c3dfd1a 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -30,6 +30,9 @@ __all__ = [
 
 
 class BaseErrorClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
     def append_clip_op(self, block, grad_name):
         raise NotImplementedError()
 
@@ -44,6 +47,9 @@ class ErrorClipByValue(BaseErrorClipAttr):
         self.max = max
         self.min = min
 
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
     def append_clip_op(self, block, grad_name):
         clip_op_desc = block.desc.append_op()
         clip_op_desc.set_type("clip")
@@ -71,6 +77,9 @@ def error_clip_callback(block, context):
 
 
 class BaseGradientClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
     def process_context(self, context, param, grad):
         raise NotImplementedError()
 
@@ -79,6 +88,9 @@ class BaseGradientClipAttr(object):
 
 
 class NullGradientClipAttr(BaseGradientClipAttr):
+    def __str__(self):
+        return "Null"
+
     def process_context(self, context, param, grad):
         pass
 
@@ -96,6 +108,9 @@ class GradientClipByValue(BaseGradientClipAttr):
         self.max = max
         self.min = min
 
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
     def process_context(self, context, param, grad):
         pass
 
@@ -108,6 +123,9 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
+    def __str__(self):
+        return "ByNorm, clip_norm=%f" % self.clip_norm
+
     def process_context(self, context, param, grad):
         pass
 
@@ -124,6 +142,10 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         self.clip_norm = clip_norm
         self.group_name = group_name
 
+    def __str__(self):
+        return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
+                                                              self.clip_norm)
+
     def process_context(self, context, param, grad):
         if self.group_name not in context:
             context[self.group_name] = []
@@ -160,6 +182,17 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 
 
 def set_gradient_clip(clip, param_list=None, program=None):
+    """
+        To specify parameters that require gradient clip.
+        Args:
+            clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, 
+                    which describes the type and detailed attributes of required gradient clip.
+            param_list(list, None by default): Parameters that require gradient clip. 
+                    It can be a list of parameter or a list of parameter's name. 
+                    When it's None, all parameters in the program will be included. 
+            program(Program, None by default): The program where parameters are. 
+                    Will be the default main program when assigned with None.
+    """
     if not isinstance(clip, BaseGradientClipAttr):
         raise TypeError(
             "'clip' should be an instance of BaseGradientClipAttr's derived class"
@@ -199,3 +232,5 @@ def append_gradient_clip_ops(param_grad):
 
 
 ClipByValue = GradientClipByValue
+ClipByNorm = GradientClipByNorm
+ClipByGlobalNorm = GradientClipByGlobalNorm
diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py
new file mode 100644
index 0000000000000000000000000000000000000000..db1808c64745ac153962c050b08993450dd93c06
--- /dev/null
+++ b/python/paddle/v2/fluid/debuger.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+from graphviz import GraphPreviewGenerator
+import proto.framework_pb2 as framework_pb2
+
+_vartype2str_ = [
+    "UNK",
+    "LoDTensor",
+    "SelectedRows",
+    "FeedMinibatch",
+    "FetchList",
+    "StepScopes",
+    "LodRankTable",
+    "LoDTensorArray",
+    "PlaceList",
+]
+_dtype2str_ = [
+    "bool",
+    "int16",
+    "int32",
+    "int64",
+    "float16",
+    "float32",
+    "float64",
+]
+
+
+def repr_data_type(type):
+    return _dtype2str_[type]
+
+
+def repr_tensor(proto):
+    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
+                                              str(proto.dims))
+
+
+reprtpl = "{ttype} {name} ({reprs})"
+
+
+def repr_lodtensor(proto):
+    if not proto.lod_tensor: return
+    level = proto.lod_tensor.lod_level
+    reprs = repr_tensor(proto.lod_tensor.tensor)
+    return reprtpl.format(
+        ttype="LoDTensor" if level > 0 else "Tensor",
+        name=proto.name,
+        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
+
+
+def repr_selected_rows(proto):
+    if not proto.selected_rows: return
+    return reprtpl.format(
+        ttype="SelectedRows",
+        name=proto.name,
+        reprs=repr_tensor(proto.selected_rows))
+
+
+def repr_tensor_array(proto):
+    if not proto.tensor_array: return
+    return reprtpl.format(
+        ttype="TensorArray",
+        name=proto.name,
+        reprs="level=%d, %s" % (proto.tensor_array.lod_level,
+                                repr_tensor(proto.lod_tensor)))
+
+
+type_handlers = [
+    repr_lodtensor,
+    repr_selected_rows,
+    repr_tensor_array,
+]
+
+
+def repr_var(vardesc):
+    for handler in type_handlers:
+        res = handler(vardesc)
+        if res:
+            return res
+
+
+def pprint_program_codes(program_desc):
+    reprs = []
+    for block_idx in range(program_desc.num_blocks()):
+        block_desc = program_desc.block(block_idx)
+        block_repr = pprint_block_codes(block_desc)
+        reprs.append(block_repr)
+    return '\n'.join(reprs)
+
+
+def pprint_block_codes(block_desc, show_backward=False):
+    def is_op_backward(op_desc):
+        if op_desc.type.endswith('_grad'): return True
+
+        def is_var_backward(var):
+            if "@GRAD" in var.parameter: return True
+            for arg in var.arguments:
+                if "@GRAD" in arg: return True
+
+        for var in op_desc.inputs:
+            if is_var_backward(var): return True
+        for var in op_desc.outputs:
+            if is_var_backward(var): return True
+        return False
+
+    def is_var_backward(var_desc):
+        return "@GRAD" in var_desc.name
+
+    if type(block_desc) is not framework_pb2.BlockDesc:
+        block_desc = framework_pb2.BlockDesc.FromString(
+            block_desc.serialize_to_string())
+    var_reprs = []
+    op_reprs = []
+    for var in block_desc.vars:
+        if not show_backward and is_var_backward(var):
+            continue
+        var_reprs.append(repr_var(var))
+
+    for op in block_desc.ops:
+        if not show_backward and is_op_backward(op): continue
+        op_reprs.append(repr_op(op))
+
+    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
+    return tpl.format(
+        idx=block_desc.idx,
+        pidx=block_desc.parent_idx,
+        vars='\n'.join(var_reprs),
+        ops='\n'.join(op_reprs), )
+
+
+def repr_attr(desc):
+    tpl = "{key}={value}"
+    valgetter = [
+        lambda attr: attr.i,
+        lambda attr: attr.f,
+        lambda attr: attr.s,
+        lambda attr: attr.ints,
+        lambda attr: attr.floats,
+        lambda attr: attr.strings,
+        lambda attr: attr.b,
+        lambda attr: attr.bools,
+        lambda attr: attr.block_idx,
+        lambda attr: attr.l,
+    ]
+    key = desc.name
+    value = valgetter[desc.type](desc)
+    if key == "dtype":
+        value = repr_data_type(value)
+    return tpl.format(key=key, value=str(value)), (key, value)
+
+
+def _repr_op_fill_constant(optype, inputs, outputs, attrs):
+    if optype == "fill_constant":
+        return "{output} = {data} [shape={shape}]".format(
+            output=','.join(outputs),
+            data=attrs['value'],
+            shape=str(attrs['shape']))
+
+
+op_repr_handlers = [_repr_op_fill_constant, ]
+
+
+def repr_op(opdesc):
+    optype = None
+    attrs = []
+    attr_dict = {}
+    is_target = None
+    inputs = []
+    outputs = []
+
+    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
+    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
+    for var in opdesc.inputs:
+        key = var.parameter
+        value = args2value(var.arguments)
+        inputs.append("%s=%s" % (key, value))
+    for var in opdesc.outputs:
+        value = args2value(var.arguments)
+        outputs.append(value)
+    for attr in opdesc.attrs:
+        attr_repr, attr_pair = repr_attr(attr)
+        attrs.append(attr_repr)
+        attr_dict[attr_pair[0]] = attr_pair[1]
+
+    is_target = opdesc.is_target
+
+    for handler in op_repr_handlers:
+        res = handler(opdesc.type, inputs, outputs, attr_dict)
+        if res: return res
+
+    return tpl.format(
+        outputs=', '.join(outputs),
+        optype=opdesc.type,
+        inputs=', '.join(inputs),
+        attrs="{%s}" % ','.join(attrs),
+        is_target=", is_target" if is_target else "")
+
+
+def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
+    '''
+    Generate a debug graph for block.
+    Args:
+        block(Block): a block.
+    '''
+    graph = GraphPreviewGenerator("some graph")
+    # collect parameters and args
+    protostr = block.desc.serialize_to_string()
+    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+
+    def need_highlight(name):
+        if highlights is None: return False
+        for pattern in highlights:
+            assert type(pattern) is str
+            if re.match(pattern, name):
+                return True
+        return False
+
+    # draw parameters and args
+    vars = {}
+    for var in desc.vars:
+        shape = [str(i) for i in var.lod_tensor.tensor.dims]
+        if not shape:
+            shape = ['null']
+        # create var
+        if var.persistable:
+            varn = graph.add_param(
+                var.name, var.type, shape, highlight=need_highlight(var.name))
+        else:
+            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
+        vars[var.name] = varn
+
+    def add_op_link_var(op, var, op2var=False):
+        for arg in var.arguments:
+            if arg not in vars:
+                # add missing variables as argument
+                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
+            varn = vars[arg]
+            highlight = need_highlight(op.description) or need_highlight(
+                varn.description)
+            if op2var:
+                graph.add_edge(op, varn, highlight=highlight)
+            else:
+                graph.add_edge(varn, op, highlight=highlight)
+
+    for op in desc.ops:
+        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
+        for var in op.inputs:
+            add_op_link_var(opn, var, False)
+        for var in op.outputs:
+            add_op_link_var(opn, var, True)
+
+    graph(path, show=True)
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index abcad899bfac9ba3eff20cde825e136d867a4485..c5f1d51bd718acf32d173b97ee7bb7cdeb443c63 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -33,6 +33,10 @@ class VarBlock:
         return "%s:%d:%d" % (self.varname, self.offset, self.size)
 
 
+def same_or_split_var(p_name, var_name):
+    return p_name == var_name or p_name.startswith(var_name + ".block")
+
+
 def split_dense_variable(var_list,
                          pserver_count,
                          min_block_size=1024,
@@ -149,11 +153,18 @@ class DistributeTranspiler:
             self.param_grad_ep_mapping[ep]["params"].append(param)
             self.param_grad_ep_mapping[ep]["grads"].append(grad)
 
+        rpc_client_var = program.global_block().create_var(
+            name="RPC_CLIENT_VAR",
+            psersistable=True,
+            dtype='float32',  # dtype and shape is not used in fact
+            shape=[0])
+
         # create send_op
         send_op = program.global_block().append_op(
             type="send",
             inputs={"X": send_inputs},
-            outputs={"Out": send_outputs},
+            outputs={"Out": send_outputs,
+                     "RPCClient": rpc_client_var},
             attrs={"endpoints": pserver_endpoints,
                    "epmap": eplist})
         # step4
@@ -221,7 +232,7 @@ class DistributeTranspiler:
             if len(splited_vars) <= 1:
                 continue
             orig_var = program.global_block().vars[varname]
-            if orig_var == core.VarDesc.VarType.SELECTED_ROWS:
+            if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                 height_sections = []
                 for v in splited_vars:
                     height_sections.append(v.shape[0])
@@ -230,7 +241,7 @@ class DistributeTranspiler:
                     inputs={"X": orig_var},
                     outputs={"Out": splited_vars},
                     attrs={"height_sections": height_sections})
-            elif orig_var == core.VarDesc.VarType.LOD_TENSOR:
+            elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
                 sections = []
                 for v in splited_vars:
                     sections.append(v.shape[0])
@@ -289,6 +300,9 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
+    def _op_input_var(self, op, varname):
+        pass
+
     def _is_op_on_pserver(self, endpoint, all_ops, idx):
         """
         Recursively check if the op need to run on current server.
@@ -298,29 +312,35 @@ class DistributeTranspiler:
             p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
         ]
         op = all_ops[idx]
-        if op.inputs.has_key("Param"):
-            if op.inputs["Param"].name in param_names:
+        input_names = set(op.input_names)
+        # TODO(typhoonzero): using Param and Grad input name to identify
+        # that the operator is an optimization operator, need a better way.
+        if "Param" in input_names:
+            if op.input("Param")[0] in param_names:
                 return True
             else:
                 for n in param_names:
-                    if n.startswith(op.inputs["Param"].name+".block") and \
-                       n != op.inputs["Param"].name:
+                    if same_or_split_var(n, op.input("Param")[0]) \
+                            and n != op.input("Param")[0]:
                         return True
                 return False
         else:
             j = idx - 1
             while j >= 0:
                 prev_op = all_ops[j]
-                prev_output_names = [o.name for o in prev_op.outputs.values()]
-                prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # prev_output_names = [o.name for o in prev_op.outputs.values()]
+                # prev_input_names = [o.name for o in prev_op.inputs.values()]
+                # NOTE(typhoonzero): consider list input/output
+                prev_output_names = prev_op.desc.output_arg_names()
+                prev_input_names = prev_op.desc.input_arg_names()
                 found1 = False
                 found2 = False
-                for _, v in op.inputs.iteritems():
-                    if v.name in prev_output_names:
+                for varname in op.desc.input_arg_names():
+                    if varname in prev_output_names:
                         found1 = self._is_op_on_pserver(endpoint, all_ops, j)
                 # later ops may produce output for prev op's next batch use.
-                for _, v in op.outputs.iteritems():
-                    if v.name in prev_input_names:
+                for varname in op.desc.output_arg_names():
+                    if varname in prev_input_names:
                         found2 = self._is_op_on_pserver(endpoint, all_ops, j)
                 if found1 or found2:
                     return True
@@ -331,11 +351,11 @@ class DistributeTranspiler:
         new_inputs = dict()
         # update param/grad shape first, then other inputs like
         # moment can use the updated shape
-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
             if key == "Grad":
                 grad_block = None
                 for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if g.name.startswith(var.name):
+                    if same_or_split_var(g.name, opt_op.input(key)[0]):
                         grad_block = g
                         break
                 if not grad_block:
@@ -365,7 +385,7 @@ class DistributeTranspiler:
                 # param is already created on global program
                 param_block = None
                 for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                    if p.name.startswith(var.name):
+                    if same_or_split_var(p.name, opt_op.input(key)[0]):
                         param_block = p
                         break
                 if not param_block:
@@ -378,11 +398,12 @@ class DistributeTranspiler:
 
                 new_inputs[key] = tmpvar
 
-        for key, var in opt_op.inputs.iteritems():
+        for key in opt_op.input_names:
             if key in ["Param", "Grad"]:
                 continue
             # update accumulator variable shape
             param_shape = new_inputs["Param"].shape
+            var = program.global_block().vars[opt_op.input(key)[0]]
             new_shape = self._get_optimizer_input_shape(opt_op.type, key,
                                                         var.shape, param_shape)
             tmpvar = program.global_block().create_var(
@@ -401,30 +422,44 @@ class DistributeTranspiler:
                 shape=new_shape)
 
         # change output's ParamOut variable
-        opt_op.outputs["ParamOut"] = new_inputs["Param"]
+        outputs = self._get_output_map_from_op(program.global_block(), opt_op)
+        outputs["ParamOut"] = new_inputs["Param"]
         program.global_block().append_op(
             type=opt_op.type,
             inputs=new_inputs,
-            outputs=opt_op.outputs,
+            outputs=outputs,
             attrs=opt_op.attrs)
 
     def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
         # Append the ops for parameters that do not need to be optimized/updated
-        for _, var in opt_op.inputs.iteritems():
-            program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            pserver_program.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
+        inputs = self._get_input_map_from_op(self.program.global_block().vars,
+                                             opt_op)
+        for var in inputs.itervalues():
+            if type(var) == list:
+                varlist = var
+            else:
+                varlist = [var]
+            for var in varlist:
+                # TODO(typhoonzero): will remove below line later.
+                program.global_block().create_var(
+                    name=var.name,
+                    persistable=var.persistable,
+                    dtype=var.dtype,
+                    shape=var.shape)
+                if not pserver_program.global_block().vars.has_key(var.name):
+                    pserver_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+        outputs = self._get_output_map_from_op(self.program.global_block().vars,
+                                               opt_op)
+
         program.global_block().append_op(
             type=opt_op.type,
-            inputs=opt_op.inputs,
-            outputs=opt_op.outputs,
+            inputs=inputs,
+            outputs=outputs,
             attrs=opt_op.attrs)
 
     def get_pserver_program(self, endpoint):
@@ -461,17 +496,16 @@ class DistributeTranspiler:
                                                       self.optimize_ops, idx)
             if not is_op_on_pserver:
                 continue
-            if opt_op.inputs.has_key("Grad"):
+            if "Grad" in opt_op.desc.input_arg_names():
                 self._append_pserver_ops(optimize_sub_program, pserver_program,
                                          opt_op, endpoint)
             else:
                 self._append_pserver_non_opt_ops(optimize_sub_program,
                                                  pserver_program, opt_op)
-        # Append the recv op
+        # Append the listen_and_serv op
         pserver_program.global_block().append_op(
-            type="recv",
-            inputs={"RX": self.param_grad_ep_mapping[endpoint]["grads"]
-                    },  # grads to recv
+            type="listen_and_serv",
+            inputs={},
             outputs={},
             attrs={
                 "OptimizeBlock": optimize_sub_program.global_block(),
@@ -489,6 +523,30 @@ class DistributeTranspiler:
         pserver_program.sync_with_cpp()
         return pserver_program
 
+    def _get_input_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.input_names:
+            vars = []
+            for varname in op.input(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
+    def _get_output_map_from_op(self, varmap, op):
+        iomap = dict()
+        for key in op.output_names:
+            vars = []
+            for varname in op.output(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
     def get_startup_program(self, endpoint, pserver_program):
         """
         Get startup program for current parameter server.
@@ -502,7 +560,7 @@ class DistributeTranspiler:
         def _get_splited_name_and_shape(varname):
             for idx, splited_param in enumerate(params):
                 pname = splited_param.name
-                if pname.startswith(varname) and varname != pname:
+                if same_or_split_var(pname, varname) and varname != pname:
                     return pname, splited_param.shape
             return "", []
 
@@ -519,17 +577,21 @@ class DistributeTranspiler:
 
         # 2. rename op outputs
         for op in orig_s_prog.global_block().ops:
+            new_inputs = dict()
             new_outputs = dict()
             # do not append startup op if var is not on this pserver
             op_on_pserver = False
-            for key, var in op.outputs.iteritems():
-                newname, _ = _get_splited_name_and_shape(var.name)
+            for key in op.output_names:
+                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                 if newname:
                     op_on_pserver = True
                     new_outputs[key] = created_var_map[newname]
-                elif var.name in pserver_vars:
+                elif op.output(key)[0] in pserver_vars:
                     op_on_pserver = True
-                    new_outputs[key] = pserver_vars[var.name]
+                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+
+            # most startup program ops have no inputs
+            new_inputs = self._get_input_map_from_op(pserver_vars, op)
 
             if op_on_pserver:
                 if op.type in [
@@ -538,7 +600,7 @@ class DistributeTranspiler:
                     op.attrs["shape"] = new_outputs["Out"].shape
                 s_prog.global_block().append_op(
                     type=op.type,
-                    inputs=op.inputs,
+                    inputs=new_inputs,
                     outputs=new_outputs,
                     attrs=op.attrs)
         return s_prog
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 9d5ed9571a2fa0a871a25e43b23b1a3c3a6102db..01cbdb3ec487d6e2e60890619131de0067d40db9 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -17,7 +17,9 @@ import contextlib
 from framework import Program, default_main_program
 from . import core
 
-__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
+__all__ = [
+    'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
+]
 
 g_scope = core.Scope()
 
@@ -45,27 +47,120 @@ def as_numpy(tensor):
         return [as_numpy(t) for t in tensor]
     assert isinstance(tensor, core.LoDTensor)
     lod = tensor.lod()
-    tensor_data = np.array(tensor)
-    if len(lod) == 0:
-        ans = tensor_data
-    else:
-        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
-    # elif len(lod) == 1:
-    #     ans = []
-    #     idx = 0
-    #     while idx < len(lod) - 1:
-    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
-    #         idx += 1
-    # else:
-    #     for l in reversed(lod):
-    #         ans = []
-    #         idx = 0
-    #         while idx < len(l) - 1:
-    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
-    #             idx += 1
-    #         tensor_data = ans
-    #     ans = tensor_data
-    return ans
+    if len(lod) > 0:
+        raise RuntimeError(
+            "Some of your featched tensors hold LoD information. \
+            They can not be completely cast to Python ndarray. \
+            Please set the parameter 'return_numpy' as 'False' to \
+            return LoDTensor itself directly.")
+    return np.array(tensor)
+
+
+def has_feed_operators(block, feed_targets, feed_holder_name):
+    """ Check whether the block already has feed operators.
+
+    Return false if the block does not have any feed operators.
+    If some feed operators have been prepended to the block, check that
+    the info contained in these feed operators matches the feed_targets
+    and feed_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has feed operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        feed_targets: a dictionary of {feed_target_name: feed_target_data}
+        feed_holder_name: the name of the variable that holds the data of
+            all feed targets. The type of this feed_holder variable is
+            FEED_MINIBATCH, which is essentially vector<LoDTensor>.
+
+    Returns:
+        A boolean value that indicates whether a block has feed operators
+        that match the info contained in feed_targets and feed_holder_name.
+    """
+
+    feed_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'feed':
+            feed_count += 1
+            assert op.desc.input('X')[0] == feed_holder_name
+            feed_target_name = op.desc.output('Out')[0]
+            if feed_target_name not in feed_targets:
+                raise Exception("'feed_targets' does not have {} variable".
+                                format(feed_target_name))
+        else:
+            break
+    if feed_count > 0 and feed_count != len(feed_targets):
+        raise Exception(
+            "Feed operators in program desc do not match 'feed_targets'")
+    return feed_count > 0
+
+
+def has_fetch_operators(block, fetch_targets, fetch_holder_name):
+    """ Check whether the block already has fetch operators.
+
+    Return false if the block does not have any fetch operators.
+    If some fetch operators have been appended to the block, check that
+    the info contained in these fetch operators matches the fetch_targets
+    and fetch_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has fetch operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
+        fetch_holder_name: the name of the variable that holds the data of
+            all fetch targets. The type of this fetch_holder variable is
+            FETCH_LIST, which is essentially vector<LoDTensor>.
+
+    Return:
+        A boolean value that indicates whether a block has fetch operators
+        that match the info contained in fetch_targets and fetch_holder_name.
+    """
+
+    fetch_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_count += 1
+            assert op.desc.output('Out')[0] == fetch_holder_name
+            fetch_target_name = op.desc.input('X')[0]
+            if fetch_target_name not in [
+                    var.desc.name() for var in fetch_targets
+            ]:
+                raise Exception("'fetch_targets' does not have {} variable".
+                                format(fetch_target_name))
+            idx = op.desc.attr('col')
+            assert fetch_target_name == fetch_targets[idx].desc.name()
+    if fetch_count > 0 and fetch_count != len(fetch_targets):
+        raise Exception(
+            "Fetch operators in program desc do not match 'fetch_targets'")
+    return fetch_count > 0
+
+
+def fetch_var(name, scope=None, return_numpy=True):
+    """
+    Fetch the value of the variable with the given name from the given scope
+    Args:
+        name(str): name of the variable. Typically, only persistable variables
+            can be found in the scope used for running the program.
+        scope(core.Scope|None): scope object. It should be the scope where
+            you pass to Executor.run() when running your program.
+            If None, global_scope() will be used.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray
+    Returns:
+       LodTensor|numpy.ndarray
+    """
+    assert isinstance(name, str)
+    if scope is None:
+        scope = global_scope()
+    assert isinstance(scope, core.Scope)
+
+    var = global_scope().find_var(name)
+    assert var is not None, (
+        "Cannot find " + name + " in scope. Perhaps you need to make the"
+        " variable persistable by using var.persistable = True in your"
+        " program.")
+    tensor = var.get_tensor()
+    if return_numpy:
+        tensor = as_numpy(tensor)
+    return tensor
 
 
 class Executor(object):
@@ -147,40 +242,56 @@ class Executor(object):
 
         program = program.clone()
         global_block = program.global_block()
-        feed_var = global_block.create_var(
-            name=feed_var_name,
-            type=core.VarDesc.VarType.FEED_MINIBATCH,
-            persistable=True)
-
-        for i, name in enumerate(feed):
-            out = global_block.var(name)
-            global_block.prepend_op(
-                'feed',
-                inputs={'X': [feed_var]},
-                outputs={'Out': [out]},
-                attrs={'col': i})
-            cur_feed = feed[name]
-            if not isinstance(cur_feed, core.LoDTensor):
-                cur_feed = self.aslodtensor(cur_feed)
-            core.set_feed_variable(scope, cur_feed, feed_var.name, i)
-
-        fetch_var = global_block.create_var(
-            name=fetch_var_name,
-            type=core.VarDesc.VarType.FETCH_LIST,
-            persistable=True)
-        for i, var in enumerate(fetch_list):
-            global_block.append_op(
-                type='fetch',
-                inputs={'X': [var]},
-                outputs={'Out': [fetch_var]},
-                attrs={'col': i})
+
+        if feed_var_name in global_block.vars:
+            feed_var = global_block.var(feed_var_name)
+        else:
+            feed_var = global_block.create_var(
+                name=feed_var_name,
+                type=core.VarDesc.VarType.FEED_MINIBATCH,
+                persistable=True)
+
+        if fetch_var_name in global_block.vars:
+            fetch_var = global_block.var(fetch_var_name)
+        else:
+            fetch_var = global_block.create_var(
+                name=fetch_var_name,
+                type=core.VarDesc.VarType.FETCH_LIST,
+                persistable=True)
+
+        if not has_feed_operators(global_block, feed, feed_var_name):
+            for i, name in enumerate(feed):
+                out = global_block.var(name)
+                global_block.prepend_op(
+                    type='feed',
+                    inputs={'X': [feed_var]},
+                    outputs={'Out': [out]},
+                    attrs={'col': i})
+
+        for op in global_block.ops:
+            if op.desc.type() == 'feed':
+                feed_target_name = op.desc.output('Out')[0]
+                cur_feed = feed[feed_target_name]
+                if not isinstance(cur_feed, core.LoDTensor):
+                    cur_feed = self.aslodtensor(cur_feed)
+                idx = op.desc.attr('col')
+                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
+            else:
+                break
+
+        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+            for i, var in enumerate(fetch_list):
+                global_block.append_op(
+                    type='fetch',
+                    inputs={'X': [var]},
+                    outputs={'Out': [fetch_var]},
+                    attrs={'col': i})
 
         self.executor.run(program.desc, scope, 0, True, True)
         outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
             for i in xrange(len(fetch_list))
         ]
-
         if return_numpy:
             outs = as_numpy(outs)
         return outs
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 4d8343e7de9526d527ebe93f334b59108d5ace8e..a517db68c5886fbcbe19e6981aee5bf3971352e4 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -14,6 +14,7 @@
 
 import collections
 import contextlib
+import re
 
 import numpy as np
 
@@ -30,6 +31,7 @@ __all__ = [
     'program_guard',
     'switch_startup_program',
     'switch_main_program',
+    'get_var',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -239,20 +241,30 @@ class Variable(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
+    def to_string(self, throw_on_error, with_details=False):
         """
         Get debug string.
 
         Args:
             throw_on_error(bool): True if raise an exception when self is not
                 intialized.
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
 
         Returns(str): The debug string.
 
         """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+        res_str = _debug_string_(proto, throw_on_error)
+        if with_details:
+            additional_attr = ("error_clip", "stop_gradient")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        return res_str
 
     __repr__ = __str__
 
@@ -440,9 +452,8 @@ class Operator(object):
             if not given == need:
                 raise ValueError(("Incorrect setting for output(s) of "
                                   "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type, ", ".join(str(e)
-                                                  for e in need), ", ".join(
-                                                      str(e) for e in given)))
+                                 (type, ", ".join(str(e) for e in need),
+                                  ", ".join(str(e) for e in given)))
 
             for out_proto in proto.outputs:
                 out_args = outputs[out_proto.name]
@@ -478,7 +489,8 @@ class Operator(object):
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'parallel_do'
+            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
+            'load_combine'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
@@ -629,10 +641,36 @@ class Block(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.BlockDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            re_add_indent = re.compile(r"\n(.)")
+            res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
+                self.idx, self.parent_idx)
+            for var in self.vars.itervalues():
+                res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", var.to_string(throw_on_error, with_details))
+            for op in self.ops:
+                res_str += "\n  ops {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", op.to_string(throw_on_error))
+            res_str += "\n}"
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.BlockDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
 
     __repr__ = __str__
 
@@ -702,6 +740,9 @@ class Block(object):
             raise e
         self.desc.remove_op(start, end + 1)
 
+    def slice_ops(self, start, end):
+        return list(self.ops)[start:end]
+
     def prepend_op(self, *args, **kwargs):
         op_desc = self.desc.prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
@@ -796,10 +837,29 @@ class Program(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.ProgramDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = ""
+            for block in self.blocks:
+                res_str += block.to_string(throw_on_error, with_details)
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
 
     def get_desc(self):
         return self.desc
@@ -950,6 +1010,36 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = Variable.to_string(self, throw_on_error, True)
+            additional_attr = ("trainable", "optimize_attr", "regularizer",
+                               "gradient_clip_attr")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        else:
+            res_str = Variable.to_string(self, throw_on_error, False)
+        return res_str
+
+    __repr__ = __str__
+
 
 # program is a global instance.
 _main_program_ = Program()
@@ -1037,3 +1127,22 @@ def program_guard(main_program, startup_program=None):
     switch_main_program(main_program)
     if startup_program is not None:
         switch_startup_program(startup_program)
+
+
+def get_var(name, program=None):
+    """
+    Get a variable by name from the global block of a program
+    Args:
+        name(str): name of the variable
+        program(Program|None): program object.
+             If None, default_global_program() will be used.
+
+    Returns:
+        Variable
+    """
+    if program is None:
+        program = default_main_program()
+    assert isinstance(name, str)
+    assert isinstance(name, Program)
+
+    return program.global_block().var(name)
diff --git a/python/paddle/v2/fluid/graphviz.py b/python/paddle/v2/fluid/graphviz.py
new file mode 100644
index 0000000000000000000000000000000000000000..5881119c39231282b5654cd60720a1d8a7877896
--- /dev/null
+++ b/python/paddle/v2/fluid/graphviz.py
@@ -0,0 +1,272 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import subprocess
+import logging
+
+
+def crepr(v):
+    if type(v) is str or type(v) is unicode:
+        return '"%s"' % v
+    return str(v)
+
+
+class Rank(object):
+    def __init__(self, kind, name, priority):
+        '''
+        kind: str
+        name: str
+        priority: int
+        '''
+        self.kind = kind
+        self.name = name
+        self.priority = priority
+        self.nodes = []
+
+    def __str__(self):
+        if not self.nodes:
+            return ''
+
+        return '{' + 'rank={};'.format(self.kind) + \
+               ','.join([node.name for node in self.nodes]) + '}'
+
+
+class Graph(object):
+    rank_counter = 0
+
+    def __init__(self, title, **attrs):
+        self.title = title
+        self.attrs = attrs
+        self.nodes = []
+        self.edges = []
+        self.rank_groups = {}
+
+    def code(self):
+        return self.__str__()
+
+    def rank_group(self, kind, priority):
+        name = "rankgroup-%d" % Graph.rank_counter
+        Graph.rank_counter += 1
+        rank = Rank(kind, name, priority)
+        self.rank_groups[name] = rank
+        return name
+
+    def node(self, label, prefix, description="", **attrs):
+        node = Node(label, prefix, description, **attrs)
+
+        if 'rank' in attrs:
+            rank = self.rank_groups[attrs['rank']]
+            del attrs['rank']
+            rank.nodes.append(node)
+        self.nodes.append(node)
+        return node
+
+    def edge(self, source, target, **attrs):
+        edge = Edge(source, target, **attrs)
+        self.edges.append(edge)
+        return edge
+
+    def compile(self, dot_path):
+        file = open(dot_path, 'w')
+        file.write(self.__str__())
+        image_path = os.path.join(
+            os.path.dirname(__file__), dot_path[:-3] + "pdf")
+        cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
+        subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        logging.warning("write block debug graph to {}".format(image_path))
+        return image_path
+
+    def show(self, dot_path):
+        image = self.compile(dot_path)
+        cmd = ["open", image]
+        subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+
+    def _rank_repr(self):
+        ranks = sorted(
+            self.rank_groups.items(),
+            cmp=lambda a, b: a[1].priority > b[1].priority)
+        repr = []
+        for x in ranks:
+            repr.append(str(x[1]))
+        return '\n'.join(repr) + '\n'
+
+    def __str__(self):
+        reprs = [
+            'digraph G {',
+            'title = {}'.format(crepr(self.title)),
+        ]
+
+        for attr in self.attrs:
+            reprs.append("{key}={value};".format(
+                key=attr, value=crepr(self.attrs[attr])))
+
+        reprs.append(self._rank_repr())
+
+        random.shuffle(self.nodes)
+        reprs += [str(node) for node in self.nodes]
+
+        for x in self.edges:
+            reprs.append(str(x))
+
+        reprs.append('}')
+        return '\n'.join(reprs)
+
+
+class Node(object):
+    counter = 1
+
+    def __init__(self, label, prefix, description="", **attrs):
+        self.label = label
+        self.name = "%s_%d" % (prefix, Node.counter)
+        self.description = description
+        self.attrs = attrs
+        Node.counter += 1
+
+    def __str__(self):
+        reprs = '{name} [label={label} {extra} ];'.format(
+            name=self.name,
+            label=self.label,
+            extra=',' + ','.join("%s=%s" % (key, crepr(value))
+                                 for key, value in self.attrs.items())
+            if self.attrs else "")
+        return reprs
+
+
+class Edge(object):
+    def __init__(self, source, target, **attrs):
+        '''
+        Link source to target.
+        :param source: Node
+        :param target: Node
+        :param graph: Graph
+        :param attrs: dic
+        '''
+        self.source = source
+        self.target = target
+        self.attrs = attrs
+
+    def __str__(self):
+        repr = "{source} -> {target} {extra}".format(
+            source=self.source.name,
+            target=self.target.name,
+            extra="" if not self.attrs else
+            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
+                           for attr in self.attrs.items()) + "]")
+        return repr
+
+
+class GraphPreviewGenerator(object):
+    '''
+    Generate a graph image for ONNX proto.
+    '''
+
+    def __init__(self, title):
+        # init graphviz graph
+        self.graph = Graph(
+            title,
+            layout="dot",
+            concentrate="true",
+            rankdir="TB", )
+
+        self.op_rank = self.graph.rank_group('same', 2)
+        self.param_rank = self.graph.rank_group('same', 1)
+        self.arg_rank = self.graph.rank_group('same', 0)
+
+    def __call__(self, path='temp.dot', show=False):
+        if not show:
+            self.graph.compile(path)
+        else:
+            self.graph.show(path)
+
+    def add_param(self, name, data_type, shape, highlight=False):
+        label = '\n'.join([
+            '<<table cellpadding="5">',
+            '  <tr>',
+            '    <td bgcolor="#2b787e">',
+            '    <b>',
+            name,
+            '    </b>',
+            '    </td>',
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            str(data_type),
+            '    </td>'
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            '[%s]' % 'x'.join(shape),
+            '    </td>'
+            '  </tr>',
+            '</table>>',
+        ])
+        return self.graph.node(
+            label,
+            prefix="param",
+            description=name,
+            shape="none",
+            style="rounded,filled,bold",
+            width="1.3",
+            color="#148b97" if not highlight else "orange",
+            fontcolor="#ffffff",
+            fontname="Arial")
+
+    def add_op(self, opType, **kwargs):
+        highlight = False
+        if 'highlight' in kwargs:
+            highlight = kwargs['highlight']
+            del kwargs['highlight']
+        return self.graph.node(
+            "<<B>%s</B>>" % opType,
+            prefix="op",
+            description=opType,
+            shape="box",
+            style="rounded, filled, bold",
+            color="#303A3A" if not highlight else "orange",
+            fontname="Arial",
+            fontcolor="#ffffff",
+            width="1.3",
+            height="0.84", )
+
+    def add_arg(self, name, highlight=False):
+        return self.graph.node(
+            crepr(name),
+            prefix="arg",
+            description=name,
+            shape="box",
+            style="rounded,filled,bold",
+            fontname="Arial",
+            fontcolor="#999999",
+            color="#dddddd" if not highlight else "orange")
+
+    def add_edge(self, source, target, **kwargs):
+        highlight = False
+        if 'highlight' in kwargs:
+            highlight = kwargs['highlight']
+            del kwargs['highlight']
+        return self.graph.edge(
+            source,
+            target,
+            color="#00000" if not highlight else "orange",
+            **kwargs)
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 5b02d2495d1ebe9e82e7f847e5bd07548901c7fc..0f43e46082a8988be4805a2b750227312ba80ff3 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import cPickle as pickle
 
 from paddle.v2.fluid.evaluator import Evaluator
 from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
@@ -47,6 +46,9 @@ def is_parameter(var):
 
 
 def is_persistable(var):
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+       var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
+        return False
     return var.persistable
 
 
@@ -61,7 +63,12 @@ def _clone_var_in_block_(block, var):
         persistable=True)
 
 
-def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def save_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              save_file_name=None):
     """
     Save variables to directory by executor.
 
@@ -70,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     :param main_program: program. If vars is None, then filter all variables in this
     program which fit `predicate`. Default default_main_program.
     :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be saved.
-    :param vars: variables need to be saved. If specify vars, program & predicate
+    as a bool. If it returns true, the corresponding input variable will be saved.
+    :param vars: variables need to be saved. If vars is specified, program & predicate
     will be ignored
+    :param save_file_name: The name of a single file that all vars are saved to. 
+    If it is None, save variables to separate files.
+
     :return: None
     """
     if vars is None:
@@ -84,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
         save_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            save_file_name=save_file_name)
     else:
         save_program = Program()
         save_block = save_program.global_block()
+
+        save_var_map = {}
         for each_var in vars:
             new_var = _clone_var_in_block_(save_block, each_var)
+            if save_file_name is None:
+                save_block.append_op(
+                    type='save',
+                    inputs={'X': [new_var]},
+                    outputs={},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                save_var_map[new_var.name] = new_var
+
+        if save_file_name is not None:
+            save_var_list = []
+            for name in sorted(save_var_map.keys()):
+                save_var_list.append(save_var_map[name])
+
             save_block.append_op(
-                type='save',
-                inputs={'X': [new_var]},
+                type='save_combine',
+                inputs={'X': save_var_list},
                 outputs={},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                attrs={'file_path': os.path.join(dirname, save_file_name)})
+
         executor.run(save_program)
 
 
-def save_params(executor, dirname, main_program=None):
+def save_params(executor, dirname, main_program=None, save_file_name=None):
     """
     Save all parameters to directory with executor.
     """
@@ -107,10 +135,12 @@ def save_params(executor, dirname, main_program=None):
         dirname=dirname,
         main_program=main_program,
         vars=None,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        save_file_name=save_file_name)
 
 
-def save_persistables(executor, dirname, main_program=None):
+def save_persistables(executor, dirname, main_program=None,
+                      save_file_name=None):
     """
     Save all persistables to directory with executor.
     """
@@ -119,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None):
         dirname=dirname,
         main_program=main_program,
         vars=None,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        save_file_name=save_file_name)
 
 
-def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def load_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              load_file_name=None):
     """
     Load variables from directory by executor.
 
-    :param executor: executor that save variable
+    :param executor: executor that load variable
     :param dirname: directory path
     :param main_program: program. If vars is None, then filter all variables in this
     program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be loaded.
-    :param vars: variables need to be loaded. If specify vars, program &
+    as a bool. If it returns true, the corresponding input variable will be loaded.
+    :param vars: variables need to be loaded. If vars is specified, program &
     predicate will be ignored
+    :param load_file_name: The name of the single file that all vars are loaded from.   
+    If it is None, load variables from separate files.
+
     :return: None
     """
     if vars is None:
@@ -145,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
         load_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            load_file_name=load_file_name)
     else:
         load_prog = Program()
         load_block = load_prog.global_block()
+
+        load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             new_var = _clone_var_in_block_(load_block, each_var)
+            if load_file_name is None:
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [new_var]},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                load_var_map[new_var.name] = new_var
+
+        if load_file_name is not None:
+            load_var_list = []
+            for name in sorted(load_var_map.keys()):
+                load_var_list.append(load_var_map[name])
+
             load_block.append_op(
-                type='load',
+                type='load_combine',
                 inputs={},
-                outputs={"Out": [new_var]},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                outputs={"Out": load_var_list},
+                attrs={'file_path': os.path.join(dirname, load_file_name)})
 
         executor.run(load_prog)
 
 
-def load_params(executor, dirname, main_program=None):
+def load_params(executor, dirname, main_program=None, load_file_name=None):
     """
     load all parameters from directory by executor.
     """
@@ -169,10 +225,12 @@ def load_params(executor, dirname, main_program=None):
         executor,
         dirname=dirname,
         main_program=main_program,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        load_file_name=load_file_name)
 
 
-def load_persistables(executor, dirname, main_program=None):
+def load_persistables(executor, dirname, main_program=None,
+                      load_file_name=None):
     """
     load all persistables from directory by executor.
     """
@@ -180,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None):
         executor,
         dirname=dirname,
         main_program=main_program,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        load_file_name=load_file_name)
 
 
 def get_inference_program(target_vars, main_program=None):
@@ -191,8 +250,8 @@ def get_inference_program(target_vars, main_program=None):
     vars = []
     for var in target_vars:
         if isinstance(var, Evaluator):
-            vars.append(var.states)
-            vars.append(var.metrics)
+            vars.extend(var.states)
+            vars.extend(var.metrics)
         else:
             vars.append(var)
     pruned_program = main_program.prune(targets=vars)
@@ -200,12 +259,16 @@ def get_inference_program(target_vars, main_program=None):
     return inference_program
 
 
-def prepend_feed_ops(inference_program, feeded_var_names):
+def prepend_feed_ops(inference_program,
+                     feed_target_names,
+                     feed_holder_name='feed'):
     global_block = inference_program.global_block()
     feed_var = global_block.create_var(
-        name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True)
+        name=feed_holder_name,
+        type=core.VarDesc.VarType.FEED_MINIBATCH,
+        persistable=True)
 
-    for i, name in enumerate(feeded_var_names):
+    for i, name in enumerate(feed_target_names):
         out = global_block.var(name)
         global_block.prepend_op(
             type='feed',
@@ -214,12 +277,16 @@ def prepend_feed_ops(inference_program, feeded_var_names):
             attrs={'col': i})
 
 
-def append_fetch_ops(inference_program, fetch_var_names):
+def append_fetch_ops(inference_program,
+                     fetch_target_names,
+                     fetch_holder_name='fetch'):
     global_block = inference_program.global_block()
     fetch_var = global_block.create_var(
-        name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True)
+        name=fetch_holder_name,
+        type=core.VarDesc.VarType.FETCH_LIST,
+        persistable=True)
 
-    for i, name in enumerate(fetch_var_names):
+    for i, name in enumerate(fetch_target_names):
         global_block.append_op(
             type='fetch',
             inputs={'X': [name]},
@@ -231,7 +298,8 @@ def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
                          executor,
-                         main_program=None):
+                         main_program=None,
+                         save_file_name=None):
     """
     Build a model especially for inference,
     and save it to directory by the executor.
@@ -242,6 +310,8 @@ def save_inference_model(dirname,
     :param executor: executor that save inference model
     :param main_program: original program, which will be pruned to build the inference model.
             Default default_main_program().
+    :param save_file_name: The name of a single file that all parameters are saved to. 
+    If it is None, save parameters to separate files.
 
     :return: None
     """
@@ -269,68 +339,73 @@ def save_inference_model(dirname,
     inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
 
-    model_file_name = dirname + "/__model__"
-    with open(model_file_name, "w") as f:
-        pickle.dump({
-            "program_desc_str": inference_program.desc.serialize_to_string(),
-            "feed_var_names": feeded_var_names,
-            "fetch_var_names": fetch_var_names
-        }, f, -1)
-
     prepend_feed_ops(inference_program, feeded_var_names)
     append_fetch_ops(inference_program, fetch_var_names)
 
-    # Save only programDesc of inference_program in binary format
-    # in another file: __model__.dat
-    with open(model_file_name + ".dat", "wb") as fp:
-        fp.write(inference_program.desc.serialize_to_string())
+    if save_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
 
-    save_params(executor, dirname, main_program)
+    with open(model_file_name, "wb") as f:
+        f.write(inference_program.desc.serialize_to_string())
 
+    save_persistables(executor, dirname, inference_program, save_file_name)
 
-def load_persistables_if_exist(executor, dirname, main_program=None):
-    filenames = next(os.walk(dirname))[2]
-    filenames = set(filenames)
 
-    def _is_presistable_and_exist_(var):
-        if not is_persistable(var):
-            return False
-        else:
-            return var.name in filenames
+def get_feed_targets_names(program):
+    feed_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'feed':
+            feed_targets_names.insert(0, op.desc.output('Out')[0])
+    return feed_targets_names
 
-    load_vars(
-        executor,
-        dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=_is_presistable_and_exist_)
+
+def get_fetch_targets_names(program):
+    fetch_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_targets_names.append(op.desc.input('X')[0])
+    return fetch_targets_names
 
 
-def load_inference_model(dirname, executor):
+def load_inference_model(dirname, executor, load_file_name=None):
     """
     Load inference model from a directory
 
     :param dirname: directory path
     :param executor: executor that load inference model
-
-    :return: [program, feed_var_names, fetch_var_names]
+    :param load_file_name: The name of the single file that all parameters are loaded from.   
+    If it is None, load parameters from separate files.
+    
+    :return: [program, feed_target_names, fetch_targets]
              program: program especially for inference.
-             feeded_var_names: Names of variables that need to feed data
-             fetch_vars: Variables from which we can get inference results.
+             feed_target_names: Names of variables that need to feed data
+             fetch_targets: Variables from which we can get inference results.
     """
     if not os.path.isdir(dirname):
         raise ValueError("There is no directory named '%s'", dirname)
 
-    model_file_name = dirname + "/__model__"
-    model = pickle.load(open(model_file_name, "r"))
-    program_desc_str = model["program_desc_str"]
-    feed_var_names = model["feed_var_names"]
-    fetch_var_names = model["fetch_var_names"]
+    if load_file_name == None:
+        model_file_name = dirname + "/__model__"
+    else:
+        model_file_name = dirname + "/__model_combined__"
+
+    with open(model_file_name, "rb") as f:
+        program_desc_str = f.read()
+
     program = Program.parse_from_string(program_desc_str)
-    load_persistables_if_exist(executor, dirname, program)
-    fetch_vars = [program.global_block().var(name) for name in fetch_var_names]
+    load_persistables(executor, dirname, program, load_file_name)
 
-    return [program, feed_var_names, fetch_vars]
+    feed_target_names = get_feed_targets_names(program)
+    fetch_target_names = get_fetch_targets_names(program)
+    fetch_targets = [
+        program.global_block().var(name) for name in fetch_target_names
+    ]
+
+    return [program, feed_target_names, fetch_targets]
 
 
 def get_parameter_value(para, executor):
@@ -339,6 +414,7 @@ def get_parameter_value(para, executor):
 
     :param executor: executor for retrieving the value
     :param para: the given parameter
+
     :return: the LoDTensor for the parameter
     """
     assert is_parameter(para)
@@ -357,6 +433,7 @@ def get_parameter_value_by_name(name, executor, program=None):
     :param name: the name of the parameter
     :param program: the program where the variable is found
             Default default_main_program().
+
     :return: the LoDTensor for the variable
     """
     if program is None:
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 7d9ae53d94b6c82890150346f138e48a0dfbf15c..2119ca12c8dea6463934aa68cb1b46ec687e3f72 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -18,7 +18,7 @@ import itertools
 from framework import Variable, Parameter, default_main_program, default_startup_program, \
     unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
-from param_attr import ParamAttr
+from param_attr import ParamAttr, WeightNormParamAttr
 
 
 class LayerHelper(object):
@@ -104,6 +104,177 @@ class LayerHelper(object):
                                  (dtype, each.dtype))
         return dtype
 
+    def _create_weight_normalize(self, attr, shape, dtype):
+        from .layers import elementwise_mul, elementwise_div, reshape
+
+        # Remove these ops when LayerHelper and layers support indicating
+        # program and block.
+        def __norm_op(x,
+                      out=None,
+                      p=2,
+                      dim=None,
+                      keep_dim=False,
+                      block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            abs_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_abs'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
+            pow_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_pow'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='pow',
+                inputs={'X': abs_out},
+                outputs={'Out': pow_out},
+                attrs={'factor': float(p)})
+            sum_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_sum'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='reduce_sum',
+                inputs={'X': pow_out},
+                outputs={'Out': sum_out},
+                attrs={
+                    'dim': dim,
+                    'keep_dim': keep_dim,
+                    'reduce_all': True if dim is None else False
+                })
+            block.append_op(
+                type='pow',
+                inputs={'X': sum_out},
+                outputs={'Out': out},
+                attrs={'factor': 1. / p})
+            return out
+
+        def __reshape_op(x,
+                         shape,
+                         out=None,
+                         block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_reshape'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='reshape',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'shape': shape})
+            return out
+
+        def __transpose_op(x,
+                           axis,
+                           out=None,
+                           block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_transpose'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='transpose',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'axis': axis})
+            return out
+
+        def __norm_except_dim(x,
+                              out=None,
+                              dim=None,
+                              block=self.startup_program.global_block()):
+            """Computes the norm over all dimensions except dim"""
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            if dim is None:
+                __norm_op(x, out, dim=dim, block=block)
+            elif dim == 0:
+                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
+                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
+                norm = __norm_op(reshape, dim=1, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            elif dim == len(x.shape) - 1:
+                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
+                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
+                norm = __norm_op(reshape, dim=0, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            else:
+                perm = range(len(x.shape))
+                perm[0], perm[dim] = dim, 0
+                transpose = __transpose_op(x, perm, block=block)
+                norm = __norm_op(transpose, dim=0, block=block)
+                __transpose_op(norm, perm, out=out, block=block)
+            return out
+
+        def __weight_normalize(g, v, dim):
+            """Calculations for weight normalization"""
+            norm = __norm_except_dim(
+                v, dim=dim, block=self.main_program.current_block())
+            scale = elementwise_div(
+                x=g, y=norm)  # The shapes of g and norm are the same.
+            # Currently, elementwise_mul only support broadcast when the shape
+            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
+            # to achive the subset.
+            w = elementwise_mul(
+                x=v,
+                y=scale if dim is None else reshape(
+                    x=scale, shape=[v.shape[dim]]),
+                axis=-1 if dim is None else dim)
+            # To serialize the original parameter for inference, maybe a
+            # parameter rather than a variable should be returned.
+            return w
+
+        g_param_attr = copy.deepcopy(attr)
+        g_param_attr.name = attr.name + '_g'
+        g_param_shape = [1] * len(shape)
+        if attr.dim is not None:
+            g_param_shape[attr.dim] = shape[attr.dim]
+        v_param_attr = copy.deepcopy(attr)
+        v_param_attr.name = attr.name + '_v'
+        v_param_shape = shape
+
+        # Add to startup_program to initialize g and v.
+        # Try to reconstruct the initializer of w by initializing g and v.
+        # Set the initializers of g and v as below, then the distribution
+        # of w is the same as initializing w with the given initializer.
+        # For Data-Dependent Initialization, please compute the init-values
+        # of g and v in external and then feed the values to g and v by
+        # executing an extra program.
+        g_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=g_param_shape,
+            **g_param_attr.to_kwargs(with_initializer=False))
+        v_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=v_param_shape,
+            **v_param_attr.to_kwargs(with_initializer=True))
+        __norm_except_dim(
+            x=v_param,
+            out=g_param,
+            dim=attr.dim,
+            block=self.startup_program.global_block())
+
+        # Add weight normalization to main_program
+        g_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
+        v_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
+        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
+        return w_param
+
     def create_parameter(self,
                          attr,
                          shape,
@@ -114,16 +285,23 @@ class LayerHelper(object):
         attr = copy.deepcopy(attr)
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
 
-        if default_initializer is None:
+        if default_initializer is None and attr.initializer is None:
             if is_bias:
                 attr.set_default_bias_initializer()
             else:
                 attr.set_default_param_initializer()
         else:
             attr.set_default_initializer(default_initializer)
-        if attr.name is None:
-            attr.name = unique_name(".".join([self.name, suffix]))
+
+        # If weight normalization is set, insert extra parameters and ops.
+        # Refer to https://arxiv.org/pdf/1602.07868.pdf
+        if isinstance(attr, WeightNormParamAttr):
+            param = self._create_weight_normalize(attr, shape, dtype)
+            WeightNormParamAttr.params_with_weight_norm.append(param)
+            return param
 
         self.startup_program.global_block().create_parameter(
             dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 0fcbfe0e2f2f9686366139e84b7fdcc158bf0aa7..71a9459d556e2b3e25b1cd4ae768a8fb8ae41273 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
+from ops import logical_and, logical_not, logical_or
 
 __all__ = [
     'split_lod_tensor',
@@ -27,6 +28,7 @@ __all__ = [
     'StaticRNNMemoryLink',
     'WhileGuard',
     'While',
+    'Switch',
     'lod_rank_table',
     'max_sequence_len',
     'topk',
@@ -36,6 +38,7 @@ __all__ = [
     'array_write',
     'create_array',
     'less_than',
+    'equal',
     'array_read',
     'shrink_memory',
     'array_length',
@@ -274,21 +277,20 @@ class ParallelDo(object):
         parent_block = self.parent_block()
 
         local_inputs = set()
-
-        for op in current_block.ops:
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
+        params = list()
         for var in self.inputs:
             local_inputs.add(var.name)
 
-        params = list()
         for op in current_block.ops:
             for iname in op.input_names:
                 for in_var_name in op.input(iname):
                     if in_var_name not in local_inputs:
                         params.append(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
         params = list(set(params))
 
         return [parent_block.var(name) for name in params]
@@ -973,6 +975,36 @@ def less_than(x, y, cond=None, **ignored):
     return cond
 
 
+def equal(x, y, cond=None, **ignored):
+    """
+    **equal**
+
+    This layer returns the truth value of :math:`x == y` elementwise.
+
+    Args:
+        x(Variable): First operand of *equal*
+        y(Variable): Second operand of *equal*
+        cond(Variable|None): Optional output variable to store the result of *equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *equal*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.equal(x=label, y=limit)
+    """
+    helper = LayerHelper("equal", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='equal', inputs={'X': [x],
+                              'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i):
     """This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
@@ -1063,11 +1095,12 @@ class ConditionalBlockGuard(BlockGuard):
 
 
 class ConditionalBlock(object):
-    def __init__(self, inputs, name=None):
+    def __init__(self, inputs, is_scalar_condition=False, name=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
                 raise TypeError("Each input should be variable")
         self.inputs = inputs
+        self.is_scalar_condition = is_scalar_condition
         self.helper = LayerHelper('conditional_block', name=name)
 
     def block(self):
@@ -1112,7 +1145,66 @@ class ConditionalBlock(object):
             },
             outputs={'Out': out_list,
                      'Scope': [step_scope]},
-            attrs={'sub_block': inside_block})
+            attrs={
+                'sub_block': inside_block,
+                'is_scalar_condition': self.is_scalar_condition
+            })
+
+
+class Switch(object):
+    def __init__(self, name=None):
+        self.helper = LayerHelper('switch', name=name)
+        self.inside_scope = False
+        self.pre_not_conditions = []
+
+    def case(self, condition):
+        """create a new block for this condition
+        """
+        if not self.inside_scope:
+            raise ValueError("case should be called inside with")
+
+        if len(self.pre_not_conditions) == 0:
+            cond_block = ConditionalBlock([condition], is_scalar_condition=True)
+            not_cond = logical_not(x=condition)
+            self.pre_not_conditions.append(not_cond)
+        else:
+            pre_cond_num = len(self.pre_not_conditions)
+            pre_not_cond = self.pre_not_conditions[pre_cond_num - 1]
+            new_not_cond = logical_and(
+                x=pre_not_cond, y=logical_not(x=condition))
+            self.pre_not_conditions.append(new_not_cond)
+            cond_block = ConditionalBlock(
+                [logical_and(
+                    x=pre_not_cond, y=condition)],
+                is_scalar_condition=True)
+
+        return ConditionalBlockGuard(cond_block)
+
+    def default(self):
+        """create a default case for this switch
+        """
+        pre_cond_num = len(self.pre_not_conditions)
+        if pre_cond_num == 0:
+            raise ValueError("there should be at least one condition")
+        cond_block = ConditionalBlock(
+            [self.pre_not_conditions[pre_cond_num - 1]],
+            is_scalar_condition=True)
+        return ConditionalBlockGuard(cond_block)
+
+    def __enter__(self):
+        """
+        set flag that now is inside switch.block {}
+        :return:
+        """
+        self.inside_scope = True
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.inside_scope = False
+        if exc_type is not None:
+            return False  # re-raise exception
+
+        return True
 
 
 class IfElseBlockGuard(object):
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
index 9af00e7de560d96103b54b37facaeadba2d3fe23..85e44a0e5149bd36f2787d9f2d516dbe4abdbb2e 100644
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -14,8 +14,10 @@
 
 from .. import core
 from ..layer_helper import LayerHelper
+from control_flow import BlockGuard
+from ..layer_helper import LayerHelper
 
-__all__ = ['data']
+__all__ = ['data', 'BlockGuardServ', 'ListenAndServ', 'Send']
 
 
 def data(name,
@@ -74,3 +76,151 @@ def data(name,
         type=type,
         stop_gradient=stop_gradient,
         lod_level=lod_level)
+
+
+class BlockGuardServ(BlockGuard):
+    """
+    BlockGuardServ class.
+
+    BlockGuardServ class is used to create an op with a block in a program.
+    """
+
+    def __init__(self, server):
+        if not (isinstance(server, ListenAndServ)):
+            raise TypeError("BlockGuardServ takes a ListenAndServ")
+        super(BlockGuardServ, self).__init__(server.helper.main_program)
+        self.server = server
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+
+        self.server.complete_op()
+        return super(BlockGuardServ, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class ListenAndServ(object):
+    """
+    ListenAndServ class.
+
+    ListenAndServ class is used to wrap listen_and_serv op to create a server
+    which can receive variables from clients and run a block.
+    """
+
+    def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
+        self.helper = LayerHelper("listen_and_serv")
+        self.inputs = []
+        self.outputs = []
+        self.endpoint = endpoint
+        self.fan_in = fan_in
+        # FIXME(typhoonzero): add optimizer_mode is stupid, should make it more
+        # general.
+        self.optimizer_mode = optimizer_mode
+
+    def do(self):
+        return BlockGuardServ(self)
+
+    def get_params_and_grads(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+        # params and grads in the same order.
+        params = list()
+        grads = list()
+        for op in current_block.ops:
+            # FIXME(typhoonzero): op.inputs is None if it's cloned.
+            if self.optimizer_mode:
+                if "Grad" in op.inputs and "Param" in op.inputs:
+                    params.append(op.inputs["Param"].name)
+                    grads.append(op.inputs["Grad"].name)
+            else:
+                # simple recv mode, recv operators inputs.
+                for iname in op.input_names:
+                    for in_var_name in op.input(iname):
+                        params.append(parent_block.var(in_var_name))
+                        grads.append(parent_block.var(in_var_name))
+
+        return params, grads
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def complete_op(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        params, grads = self.get_params_and_grads()
+        param_names = [p.name for p in params]
+        grad_names = [g.name for g in grads]
+        parent_block.append_op(
+            type='listen_and_serv',
+            inputs={},
+            outputs={},
+            attrs={
+                'endpoint': self.endpoint,
+                'Fanin': self.fan_in,
+                'ParamList': param_names,
+                'GradList': grad_names,
+                'OptimizeBlock': current_block
+            })
+
+
+def Send(endpoints, send_vars, get_vars):
+    """
+    Send layer
+
+    Args:
+        endpoints: comma seperated IP:PORT pairs in the order
+                   of send_vars to send
+        send_vars: vars to send
+        get_vars: vars to get from server after send completes.
+
+    Send variables to the server side, and get vars from server
+    side when server have finished running server side program.
+    """
+    assert (type(send_vars) == list)
+    assert (type(get_vars) == list)
+
+    epmap = endpoints.split(",")
+    endpoints = list(set(epmap))
+
+    helper = LayerHelper("Send", **locals())
+    helper.append_op(
+        type="send",
+        inputs={"X": send_vars},
+        outputs={"Out": get_vars},
+        attrs={"endpoints": endpoints,
+               "epmap": epmap})
+
+
+def Recv(endpoints, get_vars):
+    """
+    Recv layer
+
+    Args:
+        endpoints: comma seperated IP:PORT pairs in the order
+                   of send_vars to send
+        send_vars: vars to send
+        get_vars: vars to get from server after send completes.
+
+    Send variables to the server side, and get vars from server
+    side when server have finished running server side program.
+    """
+    assert (type(send_vars) == list)
+    assert (type(get_vars) == list)
+
+    epmap = endpoints.split(",")
+    endpoints = list(set(epmap))
+
+    helper = LayerHelper("Recv", **locals())
+    helper.append_op(
+        type="recv",
+        inputs={"X": get_vars},
+        outputs={"Out": get_vars},
+        attrs={"endpoints": endpoints,
+               "epmap": epmap})
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
index f359e70126f7601b75261e795b5a37bdc241112e..79a130a3eb148e6c5a8fa3cdf174780b354c23c9 100644
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -145,7 +145,9 @@ def monkey_patch_variable():
             # a*b == b*a. Do not need to reverse explicitly
         ("__rmul__", "elementwise_mul", False),
         ("__div__", "elementwise_div", False),
-        ("__rdiv__", "elementwise_div", True)):
+        ("__rdiv__", "elementwise_div", True),
+        ("__pow__", "elementwise_pow", False),
+        ("__rpow__", "elementwise_pow", True)):
         setattr(Variable, method_name,
                 _elemwise_method_creator_(method_name, op_type, reverse))
 
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4740a36c8ad15545659ec3eadb207aa74739a8bd..0b64e09cd359fc89ddc868ae87c1afdbfface541 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -26,6 +26,7 @@ __all__ = [
     'fc',
     'embedding',
     'dynamic_lstm',
+    'dynamic_lstmp',
     'dynamic_gru',
     'gru_unit',
     'linear_chain_crf',
@@ -63,6 +64,8 @@ __all__ = [
     'nce',
     'beam_search',
     'row_conv',
+    'multiplex',
+    'layer_norm',
 ]
 
 
@@ -90,7 +93,7 @@ def fc(input,
 
     .. math::
 
-        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
 
     In the above equation:
 
@@ -255,7 +258,8 @@ def dynamic_lstm(input,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
-                 dtype='float32'):
+                 dtype='float32',
+                 name=None):
     """
     **Dynamic LSTM Layer**
 
@@ -281,7 +285,7 @@ def dynamic_lstm(input,
     W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
     our implementation, we use vectors to reprenset these diagonal weight
     matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
-    gate bias vector), :math:`\sigma` is the non-line activations, such as
+    gate bias vector), :math:`\sigma` is the non-linear activations, such as
     logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
     gate, forget gate, output gate, and cell activation vectors, respectively,
     all of which have the same size as the cell output activation vector :math:`h`.
@@ -307,25 +311,25 @@ def dynamic_lstm(input,
                          (T X 4D), where T is the total time steps in this
                          mini-batch, D is the hidden size.
         size(int): 4 * hidden size.
-        param_attr(ParamAttr): The parameter attribute for the learnable
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
 
-                               - The shape is (D x 4D), where D is the hidden
-                                 size.
                                - Weights = {:math:`W_{ch}, W_{ih}, \
                                                 W_{fh}, W_{oh}`}
-        bias_attr(ParamAttr): The bias attribute for the learnable bias
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
 
                               1. `use_peepholes = False`
-                                - The shape is (1 x 4D).
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                - The shape is (1 x 4D).
                               2. `use_peepholes = True`
-                                - The shape is (1 x 7D).
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                  W_{fc}, W_{oc}`}.
+                                - The shape is (1 x 7D).
         use_peepholes(bool): Whether to enable diagonal/peephole connections,
                              default `True`.
         is_reverse(bool): Whether to compute reversed LSTM, default `False`.
@@ -338,6 +342,8 @@ def dynamic_lstm(input,
                               Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
 
     Returns:
         tuple: The hidden state, and cell state of LSTM. The shape of both \
@@ -352,6 +358,7 @@ def dynamic_lstm(input,
             forward, _ = fluid.layers.dynamic_lstm(
                 input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
     """
+
     helper = LayerHelper('lstm', **locals())
     size = size / 4
     weight = helper.create_parameter(
@@ -388,6 +395,192 @@ def dynamic_lstm(input,
     return hidden, cell
 
 
+def dynamic_lstmp(input,
+                  size,
+                  proj_size,
+                  param_attr=None,
+                  bias_attr=None,
+                  use_peepholes=True,
+                  is_reverse=False,
+                  gate_activation='sigmoid',
+                  cell_activation='tanh',
+                  candidate_activation='tanh',
+                  proj_activation='tanh',
+                  dtype='float32',
+                  name=None):
+    """
+    **Dynamic LSTMP Layer**
+
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one, which is proposed to reduce the number of total
+    parameters and furthermore computational complexity for the LSTM,
+    espeacially for the case that the size of output units is relative
+    large (https://research.google.com/pubs/archive/43905.pdf).
+
+    The formula is as follows:
+
+    .. math::
+
+        i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
+
+        f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
+
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
+
+        o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
+
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+        h_t & = o_t \odot act_h(c_t)
+
+        r_t & = \overline{act_h}(W_{rh}h_t)
+
+    In the above formula:
+
+    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
+          the matrix of weights from the input gate to the input).
+    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
+          matrices for peephole connections. In our implementation, \
+          we use vectors to reprenset these diagonal weight matrices.
+    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
+          bias vector).
+    * :math:`\sigma`: The activation, such as logistic sigmoid function.
+    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
+          gate, and cell activation vectors, respectively, all of which have \
+          the same size as the cell output activation vector :math:`h`.
+    * :math:`h`: The hidden state.
+    * :math:`r`: The recurrent projection of the hidden state.
+    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
+          computation is based on the current input and previous hidden state.
+    * :math:`\odot`: The element-wise product of the vectors.
+    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
+          activation functions and `tanh` is usually used for them.
+    * :math:`\overline{act_h}`: The activation function for the projection \
+          output, usually using `identity` or same as :math:`act_h`.
+
+    Set `use_peepholes` to `False` to disable peephole connection. The formula
+    is omitted here, please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+
+    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use fully-connected layer before LSTMP layer.
+
+    Args:
+        input(Variable): The input of dynamic_lstmp layer, which supports
+                         variable-time length input sequence. The underlying
+                         tensor in this Variable is a matrix with shape
+                         (T X 4D), where T is the total time steps in this
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        proj_size(int): The size of projection output.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weight and projection weight.
+
+                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}.
+                               - The shape of hidden-hidden weight is (P x 4D),
+                                 where P is the projection size and D the hidden
+                                 size.
+                               - Projection weight = {:math:`W_{rh}`}.
+                               - The shape of projection weight is (D x P).
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+
+                              1. `use_peepholes = False`
+                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                - The shape is (1 x 4D).
+                              2. `use_peepholes = True`
+                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+                                - The shape is (1 x 7D).
+        use_peepholes(bool): Whether to enable diagonal/peephole connections,
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and
+                              output gate. Choices = ["sigmoid", "tanh", "relu",
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        proj_activation(str): The activation for projection output.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        tuple: The projection of hidden state, and cell state of LSTMP. The \
+               shape of projection is (T x P), for the cell state which is \
+               (T x D), and both LoD is the same with the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim, proj_dim = 512, 256
+            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                     act=None, bias_attr=None)
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
+                                                     size=hidden_dim * 4,
+                                                     proj_size=proj_dim,
+                                                     use_peepholes=False,
+                                                     is_reverse=True,
+                                                     cell_activation="tanh",
+                                                     proj_activation="tanh")
+    """
+
+    helper = LayerHelper('lstmp', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
+    proj_weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    projection = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    ordered_proj0 = helper.create_tmp_variable(dtype)
+    batch_hidden = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstmp',
+        inputs={
+            'Input': input,
+            'Weight': weight,
+            'ProjWeight': proj_weight,
+            'Bias': bias
+        },
+        outputs={
+            'Projection': projection,
+            'Cell': cell,
+            'OrderedP0': ordered_proj0,
+            'BatchHidden': batch_hidden,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation,
+            'proj_activation': proj_activation
+        })
+    return projection, cell
+
+
 def dynamic_gru(input,
                 size,
                 param_attr=None,
@@ -449,8 +642,8 @@ def dynamic_gru(input,
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
 
     Returns:
-        Variable: The hidden state of GRU. The shape is (T \\times D), and lod \
-            is the same with the input.
+        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
+            and lod is the same with the input.
 
     Examples:
         .. code-block:: python
@@ -655,7 +848,35 @@ def cos_sim(X, Y, **kwargs):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
+def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
+    """
+    Computes dropout.
+
+    Drop or keep each element of `x` independently. Dropout is a regularization
+    technique for reducing overfitting by preventing neuron co-adaption during
+    training. The dropout operator randomly set (according to the given dropout
+    probability) the outputs of some units to zero, while others are remain
+    unchanged.
+
+    Args:
+       x(variable): The input tensor.
+       dropout_prob(float): Probability of setting units to zero.
+       is_test(bool): A flag indicating whether it is in test phrase or not.
+       seed(int): A Python integer used to create random seeds. If this
+                  parameter is set to None, a random seed is used.
+                  NOTE: If an integer seed is given, always the same output
+                  units will be dropped. DO NOT use a fixed seed in training.
+
+    Returns:
+        Variable: A tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
+    """
+
     helper = LayerHelper('dropout', **kwargs)
     out = helper.create_tmp_variable(dtype=x.dtype)
     mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
@@ -664,9 +885,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
         inputs={'X': [x]},
         outputs={'Out': [out],
                  'Mask': [mask]},
-        attrs={'dropout_prob': dropout_prob,
-               'is_test': is_test,
-               'seed': seed})
+        attrs={
+            'dropout_prob': dropout_prob,
+            'is_test': is_test,
+            'fix_seed': seed is not None,
+            'seed': seed if seed is not None else 0
+        })
     return out
 
 
@@ -767,7 +991,7 @@ def square_error_cost(input, label, **kwargs):
        label(Variable): Label tensor, has target labels.
 
     Returns:
-        Variable: The tensor variable storing the element-wise squared error
+        Variable: The tensor variable storing the element-wise squared error \
                   difference of input and label.
 
     Examples:
@@ -991,7 +1215,7 @@ def conv2d(input,
        act(str): Activation type. Default: None
 
     Returns:
-        Variable: The tensor variable storing the convolution and
+        Variable: The tensor variable storing the convolution and \
                   non-linearity activation result.
 
     Raises:
@@ -1008,10 +1232,17 @@ def conv2d(input,
     """
     if stride is None:
         stride = [1, 1]
-    helper = LayerHelper('conv2d', **locals())
-    dtype = helper.input_dtype()
 
     num_channels = input.shape[1]
+
+    l_type = 'conv2d'
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
+        l_type = 'depthwise_conv2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+
     if groups is None:
         num_filter_channels = num_channels
     else:
@@ -1044,7 +1275,7 @@ def conv2d(input,
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
-        type='conv2d',
+        type=l_type,
         inputs={
             'Input': input,
             'Filter': filter_param,
@@ -1255,7 +1486,9 @@ def batch_norm(input,
                param_attr=None,
                bias_attr=None,
                data_layout='NCHW',
-               name=None):
+               name=None,
+               moving_mean_name=None,
+               moving_variance_name=None):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1285,6 +1518,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
 
     mean = helper.create_global_variable(
+        name=moving_mean_name,
         dtype=input.dtype,
         shape=param_shape,
         persistable=True,
@@ -1292,6 +1526,7 @@ def batch_norm(input,
     helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
 
     variance = helper.create_global_variable(
+        name=moving_variance_name,
         dtype=input.dtype,
         shape=param_shape,
         persistable=True,
@@ -1331,6 +1566,102 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+def layer_norm(input,
+               scale=True,
+               shift=True,
+               begin_norm_axis=1,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               name=None):
+    """
+    **Layer Normalization**
+
+    Assume feature vectors exist on dimensions 
+    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+    along these dimensions for each feature vector :math:`a` with size
+    :math:`H`, then normalize each feature vector using the corresponding
+    statistics. After that, apply learnable gain and bias on the normalized
+    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
+    The formula is as follows:
+
+    .. math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+
+        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+
+    Args:
+        input(Variable): The input tensor variable.
+        scale(bool): Whether to learn the adaptive gain :math:`g` after 
+            normalization.
+        shift(bool): Whether to learn the adaptive bias :math:`b` after 
+            normalization.
+        begin_norm_axis(bool): The normalization will be performed along 
+            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+        epsilon(float): The small value added to the variance to prevent 
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            gain :math:`g`.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`.
+        act(str): Activation to be applied to the output of layer normalizaiton.
+
+    Returns:
+        Variable: A tensor variable with the same shape as the input.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+    """
+    helper = LayerHelper('layer_norm', **locals())
+    dtype = helper.input_dtype()
+
+    # create intput and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
+    if scale:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if shift:
+        assert bias_attr is not False
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+
+    # create output
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    layer_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "begin_norm_axis": begin_norm_axis})
+
+    return helper.append_activation(layer_norm_out)
+
+
 def beam_search_decode(ids, scores, name=None):
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
@@ -2291,7 +2622,8 @@ def ctc_greedy_decoder(input, blank, name=None):
                     interval [0, num_classes + 1).
 
     Returns:
-        Variable: CTC greedy decode result.
+        Variable: CTC greedy decode result. If all the sequences in result were
+        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -2707,3 +3039,55 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
                 'Filter': [filter_param]},
         outputs={'Out': [out]})
     return helper.append_activation(out)
+
+
+def multiplex(inputs, index):
+    """
+    **Multiplex Layer**
+
+    Referring to the given index variable, this layer selects rows from the
+    input variables to construct a multiplex variable. Assuming that there are
+    :math:`m` input variables and :math:`I_i` represents the i-th input
+    variable and :math:`i` is in [0, :math:`m`). All input variables are
+    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+    Please note that rank of the input tensor should be at least 2. Each input
+    variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
+    where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
+    * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
+    variable. The given index variable should be a 2-D tensor with shape
+    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+    Then the output variable will be a tensor with shape [:math:`d_0`,
+    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+    Args:
+       inputs (list): A list of variables to gather from. All variables have the
+                same shape and the rank is at least 2.
+       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+                with shape [M, 1] where M is the batch size.
+
+    Returns:
+        Variable: Multiplex variable gathered from input variables.
+
+    Examples:
+        .. code-block:: python
+
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
+            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
+            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
+    """
+    helper = LayerHelper('multiplex', **locals())
+
+    if not isinstance(inputs, list) and len(inputs) < 2:
+        raise ValueError("inputs should be a list object and contains at least "
+                         "2 elements.")
+
+    out = helper.create_tmp_variable(inputs[0].dtype)
+    helper.append_op(
+        type='multiplex',
+        inputs={'X': inputs,
+                'Ids': index},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 022a94cad440f13383a927233195bb008a688843..38dea2892fc18a9c493878d816a246522e9b9886 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -56,9 +56,15 @@ __all__ = [
     'elementwise_mul',
     'elementwise_max',
     'elementwise_min',
+    'elementwise_pow',
     'clip',
     'clip_by_norm',
+    'softmax',
     'sequence_softmax',
+    'logical_and',
+    'logical_or',
+    'logical_xor',
+    'logical_not',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 6e7d09459c07c77a8579300a1c67ae36dc3d2ba2..704e040b9f478ef61991cfbe175f1cdeaf102763 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
+from ..initializer import Constant
 from ..core import DataType
 import numpy
 
 __all__ = [
     'create_tensor',
     'create_parameter',
+    'create_global_var',
     'cast',
     'concat',
     'sums',
@@ -33,13 +35,15 @@ __all__ = [
 ]
 
 
-def create_tensor(dtype, name=None):
+def create_tensor(dtype, name=None, persistable=False):
     helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
+    return helper.create_variable(
+        name=helper.name, dtype=dtype, persistable=persistable)
 
 
 def create_parameter(shape,
                      dtype,
+                     name=None,
                      attr=None,
                      is_bias=False,
                      default_initializer=None):
@@ -58,13 +62,22 @@ def create_parameter(shape,
     Returns:
         Parameter: the created parameter
     """
-    helper = LayerHelper("create_parameter")
+    helper = LayerHelper("create_parameter", **locals())
     if attr is None:
-        attr = ParamAttr()
+        attr = ParamAttr(name=name)
     return helper.create_parameter(attr, shape, dtype, is_bias,
                                    default_initializer)
 
 
+def create_global_var(shape, value, dtype, persistable=False, name=None):
+    helper = LayerHelper("global_var", **locals())
+    var = helper.create_global_variable(
+        dtype=dtype, shape=shape, persistable=persistable, name=name)
+    helper.set_variable_initializer(
+        var, initializer=Constant(value=float(value)))
+    return var
+
+
 def cast(x, dtype):
     """
     This function takes in the input with input_dtype
@@ -284,7 +297,7 @@ def fill_constant_batch_size_like(input,
     return out
 
 
-def ones(shape, dtype):
+def ones(shape, dtype, force_cpu=False):
     """
     **ones**
 
@@ -308,7 +321,7 @@ def ones(shape, dtype):
     return fill_constant(value=1.0, **locals())
 
 
-def zeros(shape, dtype):
+def zeros(shape, dtype, force_cpu=False):
     """
     **zeros**
 
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..13dc98075f7d32f9dda56a890b98451ef81af363
--- /dev/null
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import layers
+from framework import Variable
+
+__all__ = [
+    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
+    'polynomial_decay', 'piecewise_decay'
+]
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies exponential decay to the learning rate.
+
+    ```python
+    decayed_learning_rate = learning_rate *
+            decay_rate ^ (global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for exponential_decay.")
+
+    # update learning_rate
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * (decay_rate**div_res)
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies natural exponential decay to the initial learning rate.
+
+    ```python
+    if not staircase:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for natural_exp_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Applies inverse time decay to the initial learning rate.
+
+    ```python
+    if staircase:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    else:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+
+    return learning_rate / (1 + decay_rate * div_res)
+
+
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    """Applies polynomial decay to the initial learning rate.
+
+    ```python
+    if cycle:
+        decay_steps = decay_steps * ceil(global_step / decay_steps)
+    else:
+        global_step = min(global_step, decay_steps)
+    decayed_learning_rate = (learning_rate - end_learning_rate) *
+                      (1 - global_step / decay_steps) ^ power +
+                      end_learning_rate
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        end_learning_rate: A Python `float` number.
+        power: A Python `float` number
+        cycle: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    if cycle:
+        div_res = layers.ceil(x=(global_step / decay_steps))
+        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
+        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
+
+        with layers.Switch() as switch:
+            with switch.case(layers.equal(x=global_step, y=zero_var)):
+                layers.assign(input=one_var, output=div_res)
+        decay_steps = decay_steps * div_res
+    else:
+        decay_steps_var = layers.fill_constant(
+            shape=[1], dtype='float32', value=float(decay_steps))
+        global_step = layers.elementwise_min(x=global_step, y=decay_steps_var)
+
+    return (learning_rate - end_learning_rate) * \
+           ((1 - global_step / decay_steps) ** power) + end_learning_rate
+
+
+def piecewise_decay(global_step, boundaries, values):
+    """Applies piecewise decay to the initial learning rate.
+
+    ```python
+    boundaries = [10000, 20000]
+    values = [1.0, 0.5, 0.1]
+
+    if step < 10000:
+        learning_rate = 1.0
+    elif step >= 10000 and step < 20000:
+        learning_rate = 0.5
+    else:
+        learning_rate = 0.1
+    ```
+    """
+
+    if len(values) - len(boundaries) != 1:
+        raise ValueError("len(values) - len(boundaries) should be 1")
+
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for piecewise_decay.")
+
+    lr = layers.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
+
+    with layers.Switch() as switch:
+        for i in range(len(boundaries)):
+            boundary_val = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(boundaries[i]))
+            value_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(values[i]))
+            with switch.case(layers.less_than(global_step, boundary_val)):
+                layers.assign(value_var, lr)
+        last_value_var = layers.fill_constant(
+            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
+        with switch.default():
+            layers.assign(last_value_var, lr)
+
+    return lr
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 956c5b66da28fd8e74d4fd12f249688daa72d8ac..53e0991ee8c318e0c95018b57ad48f404ce8beae 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -31,7 +31,7 @@ dtype_to_size = {
 
 
 class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num):
+    def __init__(self, Program, ops, forward_num, skip_opt):
         self._program = Program
         self._ops = ops
         self._forward_num = forward_num
@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
         self._defs = defaultdict(set)
         self._live_in = defaultdict(set)
         self._live_out = defaultdict(set)
+        self._skip_opt = skip_opt
 
     def _add_connections(self, connections):
         for node1, node2 in connections:
@@ -91,14 +92,13 @@ class ControlFlowGraph(object):
         live_in = defaultdict(set)
         live_out = defaultdict(set)
         while True:
-            for i in range(self.op_size):
+            for i in range(self.op_size, 0, -1):
                 live_in[i] = set(self._live_in[i])
                 live_out[i] = set(self._live_out[i])
-                self._live_in[i] = self._uses[i] | (
-                    self._live_out[i] - self._defs[i])
                 for s in self._successors[i]:
                     self._live_out[i] |= self._live_in[s]
-
+                self._live_in[i] = self._uses[i] | (
+                    self._live_out[i] - self._defs[i])
             if self._reach_fixed_point(live_in, live_out):
                 break
 
@@ -130,6 +130,10 @@ class ControlFlowGraph(object):
                     block_desc, x,
                     is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
                 return False
+            if x in self._skip_opt:
+                return False
+            if not self._find_var(block_desc, x, is_forward).shape():
+                return False
             return True
 
         self._build_graph()
@@ -150,6 +154,9 @@ class ControlFlowGraph(object):
                     for x in defs_can_optimize
                 ]
                 for x, x_shape in out_pair:
+                    # If x is both in uses and defs, it can not be optimized!
+                    if x in self._uses[i]:
+                        continue
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
@@ -197,28 +204,32 @@ def get_cfgs(input_program):
     block_desc = pdesc.block(0)
     op_size = block_desc.op_size()
     # Get global block ops
-    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
 
     while_sub_block_ids = []
     while_grad_sub_block_ids = []
-    while_pair = []
+    while_block_id_pair = []
+    while_op_dict = {}
 
     for i in range(op_size):
         op = block_desc.op(i)
         if op.type() == "while":
             while_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_dict[op.attr("sub_block").id] = op
         elif op.type() == "while_grad":
             while_grad_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_dict[op.attr("sub_block").id] = op
 
     # Find while/while_grad block pair
     for grad_id in while_grad_sub_block_ids:
         parent_id = pdesc.block(grad_id).parent
         if parent_id in while_sub_block_ids:
-            while_pair.append((parent_id, grad_id))
+            while_block_id_pair.append((parent_id, grad_id))
             while_sub_block_ids.remove(parent_id)
 
     # Get while/while_grad block ops
-    for parent_id, grad_id in while_pair:
+    for parent_id, grad_id in while_block_id_pair:
         while_block_ops = []
         while_block = pdesc.block(parent_id)
         while_block_op_size = while_block.op_size()
@@ -230,7 +241,11 @@ def get_cfgs(input_program):
         for i in range(while_grad_block_op_size):
             while_block_ops.append(while_grad_block.op(i))
 
-        ops_list.append((while_block_ops, while_block_op_size))
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
     # Process rest while block ops
     for parent_id in while_sub_block_ids:
@@ -240,9 +255,15 @@ def get_cfgs(input_program):
         for i in range(while_block_op_size):
             while_block_ops.append(while_block.op(i))
 
-        ops_list.append((while_block_ops, while_block_op_size))
+        while_op_output = set()
+        while_op_output.update(while_op_dict[parent_id].output_arg_names())
+
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
-    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    cfgs = [
+        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
+        for ops, forward_num, skip_opt in ops_list
+    ]
     return cfgs
 
 
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index cb63d43709e23ae04c4d23457bbb79e6f7f0ce3c..be7878f869b509fa1117e305aee662cc0123bbcc 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -194,7 +194,7 @@ def scaled_dot_product_attention(queries,
 
     Returns:
 
-        Variable: A 3-D Tensor computed by multi-head scaled dot product
+        Variable: A 3-D Tensor computed by multi-head scaled dot product \
                   attention.
 
     Raises:
@@ -333,6 +333,7 @@ def scaled_dot_product_attention(queries,
             x=product, shape=[-1, product.shape[-1]], act="softmax"),
         shape=product.shape)
     if dropout_rate:
-        weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
+        weights = layers.dropout(
+            weights, dropout_prob=dropout_rate, is_test=False)
     ctx_multiheads = layers.matmul(weights, v)
     return __combine_heads(ctx_multiheads)
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index 0c3533b892176edd5dfd111fdd771cc17d468168..f8a00e3a5fb4038a97a951a01c3a2f1a4488ae75 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -15,6 +15,7 @@
 from collections import defaultdict
 
 import framework
+import layers
 from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
@@ -33,9 +34,11 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, global_step=None, regularization=None):
+    def __init__(self, learning_rate, global_step=None, regularization=None):
+        assert learning_rate is not None
         self._global_step = global_step
         self.regularization = regularization
+        self._global_learning_rate = learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
         # allocate and manage extra variables associated with the parameters
         # to train. These variables are called accumulators.
@@ -43,6 +46,28 @@ class Optimizer(object):
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
 
+    def _create_global_learning_rate(self):
+        if isinstance(self._global_learning_rate, float):
+            self._global_learning_rate = layers.create_global_var(
+                name=unique_name("learning_rate"),
+                shape=[1],
+                value=float(self._global_learning_rate),
+                dtype='float32',
+                persistable=True)
+
+        if not isinstance(self._global_learning_rate, framework.Variable):
+            raise ValueError("learning rate should be a Variable, "
+                             "actual type is %s",
+                             type(self._global_learning_rate))
+
+    @property
+    def global_learning_rate(self):
+        """
+        get global decayed learning rate
+        :return:
+        """
+        return self._global_learning_rate
+
     def _append_optimize_op(self, block, param_and_grad):
         """ append optimize operator to block and return all the added optimize_op
         """
@@ -52,17 +77,7 @@ class Optimizer(object):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        param_lr_shape = [1]
-        param_lr_var = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=param_lr_shape,
-            lod_level=1,
-            persistable=True)
-        param_lr = param_lr * self._learning_rate
-        self.helper.set_variable_initializer(
-            var=param_lr_var, initializer=Constant(param_lr))
-        return param_lr_var
+        return self._global_learning_rate * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -163,7 +178,7 @@ class Optimizer(object):
           optimization. This will include parameter update ops, global step
           update ops and any other custom ops required by subclasses to manage
           their internal state.
-          :param startup_program: 
+          :param startup_program:
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -175,9 +190,12 @@ class Optimizer(object):
         # Create any accumulators
         program = loss.block.program
         with program_guard(program, startup_program):
+            global_block = framework.default_main_program().global_block()
+            start = len(global_block.ops)
             self.helper = LayerHelper(self.__class__.__name__)
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
+            self._create_global_learning_rate()
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
@@ -187,19 +205,14 @@ class Optimizer(object):
                                                            param_and_grad)
                     optimize_ops.append(optimize_op)
 
-            # Returned list of ops can include more ops in addition
-            # to optimization ops
-            return_ops = optimize_ops
-
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
-            finish_ops = self._finish_update(loss.block)
-            if finish_ops is not None:
-                return_ops += finish_ops
+            self._finish_update(loss.block)
 
             if self._global_step is not None:
-                return_ops.append(self._increment_global_step(loss.block))
-            return return_ops
+                self._increment_global_step(loss.block)
+            end = len(global_block.ops)
+            return global_block.slice_ops(start, end)
 
     def minimize(self,
                  loss,
@@ -231,9 +244,9 @@ class SGDOptimizer(Optimizer):
 
     def __init__(self, learning_rate, **kwargs):
         assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(**kwargs)
+        super(SGDOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "sgd"
-        self._learning_rate = learning_rate
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -259,9 +272,9 @@ class MomentumOptimizer(Optimizer):
     def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
         assert learning_rate is not None
         assert momentum is not None
-        super(MomentumOptimizer, self).__init__(**kwargs)
+        super(MomentumOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "momentum"
-        self._learning_rate = learning_rate
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
 
@@ -303,9 +316,9 @@ class AdagradOptimizer(Optimizer):
     def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
         assert learning_rate is not None
         assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(**kwargs)
+        super(AdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adagrad"
-        self._learning_rate = learning_rate
         self._epsilon = epsilon
 
     def _create_accumulators(self, block, parameters):
@@ -352,9 +365,9 @@ class AdamOptimizer(Optimizer):
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamOptimizer, self).__init__(**kwargs)
+        super(AdamOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adam"
-        self._learning_rate = learning_rate
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
@@ -457,9 +470,9 @@ class AdamaxOptimizer(Optimizer):
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__(**kwargs)
+        super(AdamaxOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adamax"
-        self._learning_rate = learning_rate
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
@@ -535,9 +548,9 @@ class DecayedAdagradOptimizer(Optimizer):
         assert decay is not None
         assert epsilon is not None
 
-        super(DecayedAdagradOptimizer, self).__init__(**kwargs)
+        super(DecayedAdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "decayed_adagrad"
-        self._learning_rate = learning_rate
         self._decay = decay
         self._epsilon = epsilon
 
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
index dcca8b6c547d10864ff4cd0af1c217d89e3b522f..fc566b8a2480ce9256d610b4731405cd6d89b7e4 100644
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -15,7 +15,10 @@
 from initializer import Initializer, Xavier, Constant
 from regularizer import WeightDecayRegularizer
 
-__all__ = ['ParamAttr']
+__all__ = [
+    'ParamAttr',
+    'WeightNormParamAttr',
+]
 
 
 class ParamAttr(object):
@@ -82,3 +85,20 @@ class ParamAttr(object):
         if with_initializer:
             kwargs['initializer'] = self.initializer
         return kwargs
+
+
+class WeightNormParamAttr(ParamAttr):
+    """
+    Used for weight normalization. Any field in ParamAttr can also be set here.
+    Besides, an extra field dim can be set to indicate the dimension except 
+    which to normalize.
+    """
+    # List to record the parameters reparameterized by weight normalization.
+    # If these parameters are treated as Variable rather than Parameter,
+    # it can be used to discriminate these parameters and help to serialize
+    # these paramters for inference.
+    params_with_weight_norm = []
+
+    def __init__(self, dim=None, **kwargs):
+        super(WeightNormParamAttr, self).__init__(**kwargs)
+        self.dim = dim
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index 51c1c8aa705513825b46fb936c6c99090c50fb7d..d33a4c52a8873b1e376eb2077014130bdcad2e12 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.fluid.core as core
+import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['CudaProfiler']
+__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
     core.enable_profiler(prof_state)
     yield
 
-    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The state must be in 'calls', 'total', "
-                         "'max', 'min', 'ave'")
     sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
     key_map = {
         'default': core.EventSortingKey.kDefault,
         'calls': core.EventSortingKey.kCalls,
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
index c2f28eecfda71e305d96c5a6b62c4f5f0fbf3fa6..0273da647afb6e95a136b5ecd0975347d9a378ff 100644
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -87,6 +87,11 @@ class WeightDecayRegularizer(object):
         """
         raise NotImplementedError()
 
+    def __str__(self):
+        """Debug string
+        """
+        raise NotImplementedError()
+
 
 class L2DecayRegularizer(WeightDecayRegularizer):
     """Implements the L2 Weight Decay Regularization
@@ -123,6 +128,9 @@ class L2DecayRegularizer(WeightDecayRegularizer):
 
         return decay
 
+    def __str__(self):
+        return "L2Decay, regularization_coeff=%f" % self._regularization_coeff
+
 
 class L1DecayRegularizer(WeightDecayRegularizer):
     """Implements the L1 Weight Decay Regularization
@@ -163,6 +171,9 @@ class L1DecayRegularizer(WeightDecayRegularizer):
 
         return decay
 
+    def __str__(self):
+        return "L1Decay, regularization_coeff=%f" % self._regularization_coeff
+
 
 # We short the class name, since users will use the regulaizer with the package
 # name. The sample code:
diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
index 83053160820a70bb5e54f721c0d7b881c5765004..26a80abcb5839e80b5a22f9415315519ce3042e8 100644
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -1,8 +1,15 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+if(NOT WITH_DISTRIBUTE)
+    list(REMOVE_ITEM TEST_OPS test_recv_op)
+endif(NOT WITH_DISTRIBUTE)
+
+list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 
 add_subdirectory(book)
 add_subdirectory(book_distribute)
diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dd28d354f4160b4be68b46a7bebcdf2097d5811a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
@@ -0,0 +1 @@
+*.inference.model
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index a35abe3e0c436be4eaed01c9b9183344c6d3b275..673c965b662a022739f8d489c331f4de9455a926 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,10 +1,6 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-list(REMOVE_ITEM TEST_OPS test_image_classification_train)
-py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
-py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
-
 # default test
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
diff --git a/python/paddle/v2/fluid/tests/book/__init__.py b/python/paddle/v2/fluid/tests/book/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 0b954c60b6bc2d721c0373243e747056f8f572cf..06860a2a465c6f8590336670372eb6ff43b10594 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -12,44 +12,78 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import contextlib
+import unittest
+import math
+import sys
 
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
 
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-avg_cost = fluid.layers.mean(x=cost)
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
 
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-sgd_optimizer.minimize(avg_cost)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-BATCH_SIZE = 20
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(x=cost)
 
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
 
-place = fluid.CPUPlace()
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
+    BATCH_SIZE = 20
 
-exe.run(fluid.default_startup_program())
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.uci_housing.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
 
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    fluid.io.save_persistables(exe, "./fit_a_line.model/")
-    fluid.io.load_persistables(exe, "./fit_a_line.model/")
-    for data in train_reader():
-        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed=feeder.feed(data),
-                                  fetch_list=[avg_cost])
-        print(avg_loss_value)
-        if avg_loss_value[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        fluid.io.save_persistables(exe, "./fit_a_line.model/")
+        fluid.io.load_persistables(exe, "./fit_a_line.model/")
+        for data in train_reader():
+            avg_loss_value, = exe.run(fluid.default_main_program(),
+                                      feed=feeder.feed(data),
+                                      fetch_list=[avg_cost])
+            print(avg_loss_value)
+            if avg_loss_value[0] < 10.0:
+                return
+            if math.isnan(float(avg_loss_value)):
+                sys.exit("got NaN loss, training failed.")
+    raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
+        avg_loss_value[0]))
+
+
+class TestFitALine(unittest.TestCase):
+    def test_cpu(self):
+        with self.program_scope_guard():
+            main(use_cuda=False)
+
+    def test_cuda(self):
+        with self.program_scope_guard():
+            main(use_cuda=True)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification.py b/python/paddle/v2/fluid/tests/book/test_image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffbe5bdbd646a03884868df659eb9d0089f9479e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification.py
@@ -0,0 +1,234 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import contextlib
+import math
+import sys
+import numpy
+import unittest
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def train(net_type, use_cuda, save_dirname):
+    classdim = 10
+    data_shape = [3, 32, 32]
+
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if net_type == "vgg":
+        print("train vgg net")
+        net = vgg16_bn_drop(images)
+    elif net_type == "resnet":
+        print("train resnet")
+        net = resnet_cifar10(images, 32)
+    else:
+        raise ValueError("%s network is not supported" % net_type)
+
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # Test program 
+    test_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer.minimize(avg_cost)
+
+    BATCH_SIZE = 128
+    PASS_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+        batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+    exe.run(fluid.default_startup_program())
+
+    loss = 0.0
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            exe.run(feed=feeder.feed(data))
+
+            if (batch_id % 10) == 0:
+                acc_list = []
+                avg_loss_list = []
+                for tid, test_data in enumerate(test_reader()):
+                    loss_t, acc_t = exe.run(program=test_program,
+                                            feed=feeder.feed(test_data),
+                                            fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
+                    acc_list.append(float(acc_t))
+                    avg_loss_list.append(float(loss_t))
+                    break  # Use 1 segment for speeding up CI
+
+                acc_value = numpy.array(acc_list).mean()
+                avg_loss_value = numpy.array(avg_loss_list).mean()
+
+                print(
+                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                    format(pass_id, batch_id + 1,
+                           float(avg_loss_value), float(acc_value)))
+
+                if acc_value > 0.01:  # Low threshold for speeding up CI
+                    fluid.io.save_inference_model(save_dirname, ["pixel"],
+                                                  [predict], exe)
+                    return
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
+def main(net_type, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "image_classification_" + net_type + ".inference.model"
+
+    train(net_type, use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestImageClassification(unittest.TestCase):
+    def test_vgg_cuda(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=True)
+
+    def test_resnet_cuda(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=True)
+
+    def test_vgg_cpu(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=False)
+
+    def test_resnet_cpu(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
deleted file mode 100644
index 30582a21d0a5eeab125f3a2764b45b51aa4f94b6..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-
-def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=False)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    def shortcut(input, ch_in, ch_out, stride):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-        else:
-            return input
-
-    def basicblock(input, ch_in, ch_out, stride):
-        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
-        short = shortcut(input, ch_in, ch_out, stride)
-        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-        tmp = block_func(input, ch_in, ch_out, stride)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
-    return fc2
-
-
-classdim = 10
-data_shape = [3, 32, 32]
-
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-net_type = "vgg"
-if len(sys.argv) >= 2:
-    net_type = sys.argv[1]
-
-if net_type == "vgg":
-    print("train vgg net")
-    net = vgg16_bn_drop(images)
-elif net_type == "resnet":
-    print("train resnet")
-    net = resnet_cifar10(images, 32)
-else:
-    raise ValueError("%s network is not supported" % net_type)
-
-predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-
-optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-opts = optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-BATCH_SIZE = 128
-PASS_NUM = 1
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
-                            feed=feeder.feed(data),
-                            fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-            pass_acc))
-        # this model is slow, so if we can train two mini batch, we think it works properly.
-        exit(0)
-exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index 1a342bf1fbbc0e5f4e3c7d440424b66c4b9f732f..1491f7a8d5496445f8300d3db1d367bb3167d2c7 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -18,7 +18,9 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
 import paddle.v2.fluid as fluid
+import contextlib
 import time
+import unittest
 
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
@@ -127,7 +129,15 @@ def to_lodtensor(data, place):
     return res
 
 
-def main():
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train(use_cuda, save_dirname=None):
     # define network topology
     word = fluid.layers.data(
         name='word_data', shape=[1], dtype='int64', lod_level=1)
@@ -175,8 +185,8 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
         batch_size=BATCH_SIZE)
-    #place = fluid.CPUPlace()
-    place = fluid.CUDAPlace(0)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     feeder = fluid.DataFeeder(
         feed_list=[
             word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
@@ -211,12 +221,102 @@ def main():
                 if batch_id != 0:
                     print("second per batch: " + str((time.time() - start_time)
                                                      / batch_id))
-
-            # exit early for CI
-            exit(0)
+                # Set the threshold low to speed up the CI test
+                if float(pass_precision) > 0.05:
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, [
+                            'word_data', 'verb_data', 'ctx_n2_data',
+                            'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+                            'ctx_p2_data', 'mark_data'
+                        ], [feature_out], exe)
+                    return
 
             batch_id = batch_id + 1
 
 
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    ts_word = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_pred = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_n2 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_n1 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_0 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_p1 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_ctx_p2 = create_random_lodtensor(lod, place, low=0, high=1)
+    ts_mark = create_random_lodtensor(lod, place, low=0, high=1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == 'word_data'
+    assert feed_target_names[1] == 'verb_data'
+    assert feed_target_names[2] == 'ctx_n2_data'
+    assert feed_target_names[3] == 'ctx_n1_data'
+    assert feed_target_names[4] == 'ctx_0_data'
+    assert feed_target_names[5] == 'ctx_p1_data'
+    assert feed_target_names[6] == 'ctx_p2_data'
+    assert feed_target_names[7] == 'mark_data'
+
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: ts_word,
+                          feed_target_names[1]: ts_pred,
+                          feed_target_names[2]: ts_ctx_n2,
+                          feed_target_names[3]: ts_ctx_n1,
+                          feed_target_names[4]: ts_ctx_0,
+                          feed_target_names[5]: ts_ctx_p1,
+                          feed_target_names[6]: ts_ctx_p2,
+                          feed_target_names[7]: ts_mark
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference Shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "label_semantic_roles.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestLabelSemanticRoles(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
 if __name__ == '__main__':
-    main()
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
index 82b760d693560dae1ab1fa39afdc186f60423e65..5716ddd3dda90958ad1008679e018542c4fb73d7 100644
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -11,21 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor
+import unittest
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
-IS_SPARSE = True
 batch_size = 2
 max_length = 8
 topk_size = 50
@@ -34,10 +33,8 @@ beam_size = 2
 
 decoder_size = hidden_dim
 
-place = core.CPUPlace()
 
-
-def encoder():
+def encoder(is_sparse):
     # encoder
     src_word_id = pd.data(
         name="src_word_id", shape=[1], dtype='int64', lod_level=1)
@@ -45,7 +42,7 @@ def encoder():
         input=src_word_id,
         size=[dict_size, word_dim],
         dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
         param_attr=fluid.ParamAttr(name='vemb'))
 
     fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
@@ -54,7 +51,7 @@ def encoder():
     return encoder_out
 
 
-def decoder_train(context):
+def decoder_train(context, is_sparse):
     # decoder
     trg_language_word = pd.data(
         name="target_language_word", shape=[1], dtype='int64', lod_level=1)
@@ -62,7 +59,7 @@ def decoder_train(context):
         input=trg_language_word,
         size=[dict_size, word_dim],
         dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
         param_attr=fluid.ParamAttr(name='vemb'))
 
     rnn = pd.DynamicRNN()
@@ -82,10 +79,10 @@ def decoder_train(context):
     return rnn()
 
 
-def decoder_decode(context):
+def decoder_decode(context, is_sparse):
     init_state = context
     array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-    counter = pd.zeros(shape=[1], dtype='int64')
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
 
     # fill the first element with init_state
     state_array = pd.create_array('float32')
@@ -117,7 +114,7 @@ def decoder_decode(context):
             input=pre_ids,
             size=[dict_size, word_dim],
             dtype='float32',
-            is_sparse=IS_SPARSE)
+            is_sparse=is_sparse)
 
         # use rnn unit to update rnn
         current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
@@ -150,7 +147,7 @@ def decoder_decode(context):
 
 
 def set_init_lod(data, lod, place):
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(data, place)
     res.set_lod(lod)
     return res
@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
         lod.append(cur_len)
     flattened_data = np.concatenate(data, axis=0).astype("int64")
     flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
 
 
-def train_main():
-    context = encoder()
-    rnn_out = decoder_train(context)
+def train_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    rnn_out = decoder_train(context, is_sparse)
     label = pd.data(
         name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
     cost = pd.cross_entropy(input=rnn_out, label=label)
@@ -212,9 +213,13 @@ def train_main():
             batch_id += 1
 
 
-def decode_main():
-    context = encoder()
-    translation_ids, translation_scores = decoder_decode(context)
+def decode_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decoder_decode(context, is_sparse)
 
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -250,6 +255,60 @@ def decode_main():
         break
 
 
+class TestMachineTranslation(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda, is_sparse):
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
+                                         if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            train_main(use_cuda, is_sparse)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    f_name = 'test_{0}_{1}_decode'.format('cuda'
+                                          if use_cuda else 'cpu', 'sparse'
+                                          if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+
+        _decorator_ = None
+        if _use_cuda_:
+            _decorator_ = unittest.skip(
+                reason='Beam Search does not support CUDA!')
+
+        inject_test_decode(
+            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+
 if __name__ == '__main__':
-    # train_main()
-    decode_main()
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
new file mode 100644
index 0000000000000000000000000000000000000000..244c1749cd522faec26f8cf8e71f7469843f534e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -0,0 +1,245 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import sys
+import numpy
+import unittest
+import math
+import sys
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "nn_type",
+        help="The neural network type, in ['mlp', 'conv']",
+        type=str,
+        choices=['mlp', 'conv'])
+    parser.add_argument(
+        "--parallel",
+        help='Run in parallel or not',
+        default=False,
+        action="store_true")
+    parser.add_argument(
+        "--use_cuda",
+        help="Run the program by using CUDA",
+        default=False,
+        action="store_true")
+    return parser.parse_args()
+
+
+BATCH_SIZE = 64
+
+
+def loss_net(hidden, label):
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return prediction, avg_loss, acc
+
+
+def mlp(img, label):
+    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
+    return loss_net(hidden, label)
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    return loss_net(conv_pool_2, label)
+
+
+def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if nn_type == 'mlp':
+        net_conf = mlp
+    else:
+        net_conf = conv_net
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            img_ = pd.read_input(img)
+            label_ = pd.read_input(label)
+            prediction, avg_loss, acc = net_conf(img_, label_)
+            for o in [avg_loss, acc]:
+                pd.write_output(o)
+
+        avg_loss, acc = pd()
+        # get mean loss and acc through every devices.
+        avg_loss = fluid.layers.mean(x=avg_loss)
+        acc = fluid.layers.mean(x=acc)
+    else:
+        prediction, avg_loss, acc = net_conf(img, label)
+
+    test_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer.minimize(avg_loss)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch, fetch nothing
+            exe.run(feed=feeder.feed(data))
+            if (batch_id + 1) % 10 == 0:
+                acc_set = []
+                avg_loss_set = []
+                for test_data in test_reader():
+                    acc_np, avg_loss_np = exe.run(program=test_program,
+                                                  feed=feeder.feed(test_data),
+                                                  fetch_list=[acc, avg_loss])
+                    acc_set.append(float(acc_np))
+                    avg_loss_set.append(float(avg_loss_np))
+                # get test acc and loss
+                acc_val = numpy.array(acc_set).mean()
+                avg_loss_val = numpy.array(avg_loss_set).mean()
+                if float(acc_val) > 0.85:  # test acc > 85%
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(
+                            save_dirname, ["img"], [prediction],
+                            exe,
+                            save_file_name=save_param_filename)
+                    return
+                else:
+                    print(
+                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                        format(pass_id, batch_id + 1,
+                               float(avg_loss_val), float(acc_val)))
+                    if math.isnan(float(avg_loss_val)):
+                        sys.exit("got NaN loss, training failed.")
+    raise AssertionError("Loss of recognize digits is too large")
+
+
+def infer(use_cuda, save_dirname=None, param_filename=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names, fetch_targets
+     ] = fluid.io.load_inference_model(save_dirname, exe, param_filename)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
+def main(use_cuda, parallel, nn_type, combine):
+    if not use_cuda and not parallel:
+        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+        save_filename = None
+        if combine == True:
+            save_filename = "__params_combined__"
+    else:
+        save_dirname = None
+        save_filename = None
+
+    train(
+        nn_type=nn_type,
+        use_cuda=use_cuda,
+        parallel=parallel,
+        save_dirname=save_dirname,
+        save_param_filename=save_filename)
+    infer(
+        use_cuda=use_cuda,
+        save_dirname=save_dirname,
+        param_filename=save_filename)
+
+
+class TestRecognizeDigits(unittest.TestCase):
+    pass
+
+
+def inject_test_method(use_cuda, parallel, nn_type, combine):
+    def __impl__(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                main(use_cuda, parallel, nn_type, combine)
+
+    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
+                                       if use_cuda else 'cpu', 'parallel'
+                                       if parallel else 'normal', 'combine'
+                                       if combine else 'separate')
+
+    setattr(TestRecognizeDigits, fn, __impl__)
+
+
+def inject_all_tests():
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            for nn_type in ('mlp', 'conv'):
+                inject_test_method(use_cuda, parallel, nn_type, True)
+
+    # One unit-test for saving parameters as separate files
+    inject_test_method(False, False, 'mlp', False)
+
+
+inject_all_tests()
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
deleted file mode 100644
index 4710d16c24e95a11108801a014f94687558fd91e..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-conv_pool_1 = fluid.nets.simple_img_conv_pool(
-    input=images,
-    filter_size=5,
-    num_filters=20,
-    pool_size=2,
-    pool_stride=2,
-    act="relu")
-conv_pool_2 = fluid.nets.simple_img_conv_pool(
-    input=conv_pool_1,
-    filter_size=5,
-    num_filters=50,
-    pool_size=2,
-    pool_stride=2,
-    act="relu")
-
-predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-optimizer = fluid.optimizer.Adam(learning_rate=0.01)
-optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-BATCH_SIZE = 50
-PASS_NUM = 3
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
-                            feed=feeder.feed(data),
-                            fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
-              str(pass_acc))
-        # print loss, acc
-        if loss < 10.0 and pass_acc > 0.9:
-            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
-            exit(0)
-
-    pass_acc = accuracy.eval(exe)
-    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
-
-exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
deleted file mode 100644
index 8776a65bf804e93dfeb295ecca34fac0840b0a90..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-BATCH_SIZE = 128
-image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-
-regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
-
-hidden1 = fluid.layers.fc(input=image,
-                          size=128,
-                          act='relu',
-                          param_attr=fluid.ParamAttr(
-                              regularizer=regularizer,
-                              gradient_clip=fluid.clip.ClipByValue(10)))
-
-hidden2 = fluid.layers.fc(input=hidden1,
-                          size=64,
-                          act='relu',
-                          param_attr=regularizer)
-
-predict = fluid.layers.fc(input=hidden2,
-                          size=10,
-                          act='softmax',
-                          param_attr=regularizer)
-
-label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-
-optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-inference_program = fluid.default_main_program().clone()
-with fluid.program_guard(inference_program):
-    test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-    test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-    inference_program = fluid.io.get_inference_program(test_target)
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-exe.run(fluid.default_startup_program())
-
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        out, acc = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-
-        test_accuracy.reset(exe)
-        for data in test_reader():
-            out, acc = exe.run(inference_program,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_cost] + test_accuracy.metrics)
-
-        test_pass_acc = test_accuracy.eval(exe)
-        print("pass_id=" + str(pass_id) + " train_cost=" + str(
-            out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc)
-              + " test_acc=" + str(test_pass_acc))
-
-        if test_pass_acc > 0.7:
-            fluid.io.save_inference_model(
-                "./recognize_digits_mlp.inference.model/", ["x"], [predict],
-                exe)
-            exit(0)
-
-exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index d4a694e5721415fd9c953a83d927b25b80f5fb47..612d51e08e4fc05b397df9d8aaaf675ba9d783af 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+import sys
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.core as core
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
@@ -102,7 +104,8 @@ def get_mov_combined_features():
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
-    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)
 
     mov_categories_emb = layers.embedding(
         input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -112,7 +115,8 @@ def get_mov_combined_features():
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
-    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)
 
     mov_title_emb = layers.embedding(
         input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
@@ -142,23 +146,22 @@ def model():
     scale_infer = layers.scale(x=inference, scale=5.0)
 
     label = layers.data(name='score', shape=[1], dtype='float32')
-
     square_cost = layers.square_error_cost(input=scale_infer, label=label)
-
     avg_cost = layers.mean(x=square_cost)
 
-    return avg_cost
+    return scale_infer, avg_cost
+
 
+def train(use_cuda, save_dirname):
+    scale_infer, avg_cost = model()
+
+    # test program
+    test_program = fluid.default_main_program().clone()
 
-def main():
-    cost = model()
     sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost)
+    opts = sgd_optimizer.minimize(avg_cost)
 
-    if USE_GPU:
-        place = core.CUDAPlace(0)
-    else:
-        place = core.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -167,6 +170,8 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.movielens.train(), buf_size=8192),
         batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
 
     feeding = {
         'user_id': 0,
@@ -182,7 +187,7 @@ def main():
     def func_feed(feeding, data):
         feed_tensors = {}
         for (key, idx) in feeding.iteritems():
-            tensor = core.LoDTensor()
+            tensor = fluid.LoDTensor()
             if key != "category_id" and key != "movie_title":
                 if key == "score":
                     numpy_data = np.array(map(lambda x: x[idx], data)).astype(
@@ -209,14 +214,117 @@ def main():
 
     PASS_NUM = 100
     for pass_id in range(PASS_NUM):
-        for data in train_reader():
-            outs = exe.run(framework.default_main_program(),
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch
+            outs = exe.run(program=fluid.default_main_program(),
                            feed=func_feed(feeding, data),
-                           fetch_list=[cost])
+                           fetch_list=[avg_cost])
             out = np.array(outs[0])
-            if out[0] < 6.0:
-                # if avg cost less than 6.0, we think our code is good.
-                exit(0)
-
-
-main()
+            if (batch_id + 1) % 10 == 0:
+                avg_cost_set = []
+                for test_data in test_reader():
+                    avg_cost_np = exe.run(program=test_program,
+                                          feed=func_feed(feeding, test_data),
+                                          fetch_list=[avg_cost])
+                    avg_cost_set.append(avg_cost_np[0])
+                    break  # test only 1 segment for speeding up CI
+
+                # get test avg_cost
+                test_avg_cost = np.array(avg_cost_set).mean()
+                if test_avg_cost < 6.0:
+                    # if avg_cost less than 6.0, we think our code is good.
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, [
+                            "user_id", "gender_id", "age_id", "job_id",
+                            "movie_id", "category_id", "movie_title"
+                        ], [scale_infer], exe)
+                    return
+
+            if math.isnan(float(out[0])):
+                sys.exit("got NaN loss, training failed.")
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded
+    # data using feed operators), and the fetch_targets (variables that
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    def create_lod_tensor(data, lod=None):
+        tensor = fluid.LoDTensor()
+        if lod is None:
+            # Tensor, the shape is [batch_size, 1]
+            index = 0
+            lod_0 = [index]
+            for l in range(len(data)):
+                index += 1
+                lod_0.append(index)
+            lod = [lod_0]
+        tensor.set_lod(lod)
+
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        tensor.set(flattened_data, place)
+        return tensor
+
+    # Use the first data from paddle.dataset.movielens.test() as input
+    assert feed_target_names[0] == "user_id"
+    user_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[1] == "gender_id"
+    gender_id = create_lod_tensor([[1]])
+
+    assert feed_target_names[2] == "age_id"
+    age_id = create_lod_tensor([[0]])
+
+    assert feed_target_names[3] == "job_id"
+    job_id = create_lod_tensor([[10]])
+
+    assert feed_target_names[4] == "movie_id"
+    movie_id = create_lod_tensor([[783]])
+
+    assert feed_target_names[5] == "category_id"
+    category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+
+    assert feed_target_names[6] == "movie_title"
+    movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
+                                    [[0, 5]])
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: user_id,
+                          feed_target_names[1]: gender_id,
+                          feed_target_names[2]: age_id,
+                          feed_target_names[3]: job_id,
+                          feed_target_names[4]: movie_id,
+                          feed_target_names[5]: category_id,
+                          feed_target_names[6]: movie_title
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print("inferred score: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the inference model
+    save_dirname = "recommender_system.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe43c680ca9319682c42836986308856185a464
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -0,0 +1,290 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import contextlib
+import math
+import sys
+import unittest
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+embedding_dim = 16
+batch_size = 10
+max_length = 50
+topk_size = 50
+encoder_size = decoder_size = hidden_dim
+IS_SPARSE = True
+USE_PEEPHOLES = False
+
+
+def bi_lstm_encoder(input_seq, hidden_size):
+    input_forward_proj = fluid.layers.fc(input=input_seq,
+                                         size=hidden_size * 4,
+                                         bias_attr=True)
+    forward, _ = fluid.layers.dynamic_lstm(
+        input=input_forward_proj,
+        size=hidden_size * 4,
+        use_peepholes=USE_PEEPHOLES)
+    input_backward_proj = fluid.layers.fc(input=input_seq,
+                                          size=hidden_size * 4,
+                                          bias_attr=True)
+    backward, _ = fluid.layers.dynamic_lstm(
+        input=input_backward_proj,
+        size=hidden_size * 4,
+        is_reverse=True,
+        use_peepholes=USE_PEEPHOLES)
+
+    forward_last = fluid.layers.sequence_last_step(input=forward)
+    backward_first = fluid.layers.sequence_first_step(input=backward)
+
+    return forward_last, backward_first
+
+
+# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
+                                   decoder_size):
+    rnn = fluid.layers.DynamicRNN()
+
+    cell_init = fluid.layers.fill_constant_batch_size_like(
+        input=decoder_boot,
+        value=0.0,
+        shape=[-1, decoder_size],
+        dtype='float32')
+    cell_init.stop_gradient = False
+
+    with rnn.block():
+        current_word = rnn.step_input(target_embedding)
+        context = rnn.static_input(context)
+
+        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+        cell_mem = rnn.memory(init=cell_init)
+        decoder_inputs = fluid.layers.concat(
+            input=[context, current_word], axis=1)
+        h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+        rnn.update_memory(hidden_mem, h)
+        rnn.update_memory(cell_mem, c)
+        out = fluid.layers.fc(input=h,
+                              size=target_dict_dim,
+                              bias_attr=True,
+                              act='softmax')
+        rnn.output(out)
+    return rnn()
+
+
+def seq_to_seq_net():
+    """Construct a seq2seq network."""
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward_last, src_backward_first = bi_lstm_encoder(
+        input_seq=src_embedding, hidden_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward_last, src_backward_first], axis=1)
+
+    decoder_boot = fluid.layers.fc(input=src_backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    trg_word_idx = fluid.layers.data(
+        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    trg_embedding = fluid.layers.embedding(
+        input=trg_word_idx,
+        size=[target_dict_dim, embedding_dim],
+        dtype='float32')
+
+    prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
+                                                encoded_vector, decoder_size)
+    label = fluid.layers.data(
+        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost, prediction
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def create_random_lodtensor(lod, place, low, high):
+    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
+    res = fluid.LoDTensor()
+    res.set(data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train(use_cuda, save_dirname=None):
+    [avg_cost, prediction] = seq_to_seq_net()
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(2):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'source_sequence': word_data,
+                               'target_sequence': trg_word,
+                               'label_sequence': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if math.isnan(float(avg_cost_val[0])):
+                sys.exit("got NaN loss, training failed.")
+            if batch_id > 3:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(
+                        save_dirname, ['source_sequence',
+                                       'target_sequence'], [prediction], exe)
+                return
+
+            batch_id += 1
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    lod = [0, 4, 10]
+    word_data = create_random_lodtensor(lod, place, low=0, high=1)
+    trg_word = create_random_lodtensor(lod, place, low=0, high=1)
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    assert feed_target_names[0] == 'source_sequence'
+    assert feed_target_names[1] == 'target_sequence'
+    results = exe.run(inference_program,
+                      feed={
+                          feed_target_names[0]: word_data,
+                          feed_target_names[1]: trg_word,
+                      },
+                      fetch_list=fetch_targets,
+                      return_numpy=False)
+    print(results[0].lod())
+    np_data = np.array(results[0])
+    print("Inference shape: ", np_data.shape)
+    print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "rnn_encoder_decoder.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestRnnEncoderDecoder(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c5cb667aed7456b54d32dcd650852cfdbd6cce1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
@@ -0,0 +1,158 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import contextlib
+import math
+import sys
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy
+
+
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy
+
+
+def main(word_dict, net_method, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    BATCH_SIZE = 128
+    PASS_NUM = 5
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, acc_out = net_method(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if cost_val < 0.4 and acc_val > 0.8:
+                return
+            if math.isnan(float(cost_val)):
+                sys.exit("got NaN loss, training failed.")
+    raise AssertionError("Cost is too large for {0}".format(
+        net_method.__name__))
+
+
+class TestUnderstandSentiment(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.word_dict = paddle.dataset.imdb.word_dict()
+
+    @contextlib.contextmanager
+    def new_program_scope(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+    def test_conv_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=False)
+
+    def test_stacked_lstm_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False)
+
+    def test_conv_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=True)
+
+    def test_stacked_lstm_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
deleted file mode 100644
index df27399dd215a579d7e3f8a1659180a06b1e7f64..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-                    hid_dim=32):
-    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = convolution_net(
-        data, label, input_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_data():
-            cost_val, acc_val = exe.run(fluid.default_main_program(),
-                                        feed=feeder.feed(data),
-                                        fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                  " pass_acc=" + str(pass_acc))
-            if cost_val < 1.0 and pass_acc > 0.8:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
deleted file mode 100644
index 117f74c59ad5bf6bb67711801cd7b9a41f39f1f8..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-from paddle.v2.fluid.layer_helper import LayerHelper
-
-
-def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
-    """
-    This function helps create an operator for the LSTM (Long Short Term
-    Memory) cell that can be used inside an RNN.
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-    rnn = fluid.layers.StaticRNN()
-    with rnn.step():
-        c_pre = rnn.memory(init=c_pre_init)
-        x_t = rnn.step_input(x)
-
-        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
-        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
-
-        dtype = x.dtype
-        c = helper.create_tmp_variable(dtype)
-        h = helper.create_tmp_variable(dtype)
-
-        helper.append_op(
-            type='lstm_unit',
-            inputs={"X": after_fc,
-                    "C_prev": c_pre},
-            outputs={"C": c,
-                     "H": h},
-            attrs={"forget_bias": forget_bias})
-
-        rnn.update_memory(c_pre, c)
-        rnn.output(h)
-
-    return rnn()
-
-
-def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
-    data = fluid.layers.data(
-        name="words",
-        shape=[seq_len * batch_size, 1],
-        append_batch_size=False,
-        dtype="int64",
-        lod_level=1)
-    label = fluid.layers.data(
-        name="label",
-        shape=[batch_size, 1],
-        append_batch_size=False,
-        dtype="int64")
-
-    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
-    emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2])
-
-    c_pre_init = fluid.layers.fill_constant(
-        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
-    c_pre_init.stop_gradient = False
-    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
-    layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2])
-
-    prediction = fluid.layers.fc(input=layer_1_out,
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def chop_data(data, chop_len=80, batch_size=50):
-    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
-
-    return data[:batch_size]
-
-
-def prepare_feed_data(data, place):
-    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-
-    label = np.array(map(lambda x: x[1], data)).astype("int64")
-    label = label.reshape([len(label), 1])
-    tensor_label = fluid.LoDTensor()
-    tensor_label.set(label, place)
-
-    return tensor_words, tensor_label
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in xrange(PASS_NUM):
-        for data in train_data():
-            chopped_data = chop_data(data)
-            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
-
-            outs = exe.run(fluid.default_main_program(),
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if acc_val > 0.7:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 8cf54846fe5dba2742ce69e34e0788e124a1a85d..f013d7f1551bdbfb2f725809e2fb4d7d686560fe 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -12,76 +12,150 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import unittest
+import os
+import math
+import sys
 
-PASS_NUM = 100
-EMBED_SIZE = 32
-HIDDEN_SIZE = 256
-N = 5
-BATCH_SIZE = 32
-IS_SPARSE = True
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-
-first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-
-embed_first = fluid.layers.embedding(
-    input=first_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_second = fluid.layers.embedding(
-    input=second_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_third = fluid.layers.embedding(
-    input=third_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_forth = fluid.layers.embedding(
-    input=forth_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-
-concat_embed = fluid.layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
-hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
-predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-sgd_optimizer.minimize(avg_cost)
-
-train_reader = paddle.batch(
-    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(
-    feed_list=[first_word, second_word, third_word, forth_word, next_word],
-    place=place)
-
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        avg_cost_np = exe.run(fluid.default_main_program(),
-                              feed=feeder.feed(data),
-                              fetch_list=[avg_cost])
-        if avg_cost_np[0] < 5.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
+
+def main(use_cuda, is_sparse, parallel):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    PASS_NUM = 100
+    EMBED_SIZE = 32
+    HIDDEN_SIZE = 256
+    N = 5
+    BATCH_SIZE = 32
+    IS_SPARSE = is_sparse
+
+    def __network__(words):
+        embed_first = fluid.layers.embedding(
+            input=words[0],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_second = fluid.layers.embedding(
+            input=words[1],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_third = fluid.layers.embedding(
+            input=words[2],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_forth = fluid.layers.embedding(
+            input=words[3],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+
+        concat_embed = fluid.layers.concat(
+            input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+        hidden1 = fluid.layers.fc(input=concat_embed,
+                                  size=HIDDEN_SIZE,
+                                  act='sigmoid')
+        predict_word = fluid.layers.fc(input=hidden1,
+                                       size=dict_size,
+                                       act='softmax')
+        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
+        avg_cost = fluid.layers.mean(x=cost)
+        return avg_cost
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+    if not parallel:
+        avg_cost = __network__(
+            [first_word, second_word, third_word, forth_word, next_word])
+    else:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            avg_cost = __network__(
+                map(pd.read_input, [
+                    first_word, second_word, third_word, forth_word, next_word
+                ]))
+            pd.write_output(avg_cost)
+
+        avg_cost = fluid.layers.mean(x=pd())
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(
+        feed_list=[first_word, second_word, third_word, forth_word, next_word],
+        place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            avg_cost_np = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+            if avg_cost_np[0] < 5.0:
+                return
+            if math.isnan(float(avg_cost_np[0])):
+                sys.exit("got NaN loss, training failed.")
+
+    raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
+
+
+FULL_TEST = os.getenv('FULL_TEST',
+                      '0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
+SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
+
+
+class W2VTest(unittest.TestCase):
+    pass
+
+
+def inject_test_method(use_cuda, is_sparse, parallel):
+    fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
+                                        if is_sparse else "dense", "parallel"
+                                        if parallel else "normal")
+
+    def __impl__(*args, **kwargs):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)
+
+    if use_cuda and is_sparse and parallel:
+        fn = __impl__
+    else:
+        # skip the other test when on CI server
+        fn = unittest.skipUnless(
+            condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
+
+    setattr(W2VTest, fn_name, fn)
+
+
+for use_cuda in (False, True):
+    for is_sparse in (False, True):
+        for parallel in (False, True):
+            inject_test_method(use_cuda, is_sparse, parallel)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
index 218dea31e10757d901c5524567f13501b64dbea5..298ecfc386b3ae093cf714a41f5072759cb2cf2e 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
@@ -1,21 +1,19 @@
-#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from __future__ import print_function
 
-import sys
-
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import os
@@ -106,10 +104,10 @@ if len(sys.argv) >= 2:
     net_type = sys.argv[1]
 
 if net_type == "vgg":
-    print("train vgg net")
+    print("training vgg net")
     net = vgg16_bn_drop(images)
 elif net_type == "resnet":
-    print("train resnet")
+    print("training resnet")
     net = resnet_cifar10(images, 32)
 else:
     raise ValueError("%s network is not supported" % net_type)
@@ -129,6 +127,7 @@ train_reader = paddle.batch(
     batch_size=BATCH_SIZE)
 
 place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe = fluid.Executor(place)
 
 t = fluid.DistributeTranspiler()
@@ -146,17 +145,14 @@ if training_role == "PSERVER":
     if not current_endpoint:
         print("need env SERVER_ENDPOINT")
         exit(1)
-    print("start pserver at:", current_endpoint)
     pserver_prog = t.get_pserver_program(current_endpoint)
     pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
     exe.run(pserver_startup)
     exe.run(pserver_prog)
-    print("pserver run end")
 elif training_role == "TRAINER":
-    print("start trainer")
     trainer_prog = t.get_trainer_program()
-    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
     exe.run(fluid.default_startup_program())
+
     for pass_id in range(PASS_NUM):
         accuracy.reset(exe)
         for data in train_reader():
@@ -164,9 +160,10 @@ elif training_role == "TRAINER":
                                 feed=feeder.feed(data),
                                 fetch_list=[avg_cost] + accuracy.metrics)
             pass_acc = accuracy.eval(exe)
-            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-                pass_acc))
-            # this model is slow, so if we can train two mini batch, we think it works properly.
+            print("pass_id:" + str(pass_id) + "loss:" + str(loss) + " pass_acc:"
+                  + str(pass_acc))
+            # this model is slow, so if we can train two mini batches,
+            # we think it works properly.
     print("trainer run end")
 else:
     print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d8885e377b0a10d8b5bad4e8fcecb9cc6fc8b64
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
@@ -0,0 +1,216 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.optimizer import SGDOptimizer
+
+IS_SPARSE = True
+BATCH_SIZE = 256
+PASS_NUM = 100
+
+
+def get_usr_combined_features():
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+    usr_fc = layers.fc(input=usr_emb, size=32)
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+    return mov_combined_features
+
+
+def model():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    # need cos sim
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+    avg_cost = layers.mean(x=square_cost)
+
+    return avg_cost
+
+
+def func_feed(feeding, data, place):
+    feed_tensors = {}
+    for (key, idx) in feeding.iteritems():
+        tensor = core.LoDTensor()
+        if key != "category_id" and key != "movie_title":
+            if key == "score":
+                numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                    "float32")
+            else:
+                numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                    "int64")
+        else:
+            numpy_data = map(lambda x: np.array(x[idx]).astype("int64"), data)
+            lod_info = [len(item) for item in numpy_data]
+            offset = 0
+            lod = [offset]
+            for item in lod_info:
+                offset += item
+                lod.append(offset)
+            numpy_data = np.concatenate(numpy_data, axis=0)
+            tensor.set_lod([lod])
+
+        numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
+        tensor.set(numpy_data, place)
+        feed_tensors[key] = tensor
+    return feed_tensors
+
+
+def main():
+    cost = model()
+    optimizer = SGDOptimizer(learning_rate=0.2)
+    optimize_ops, params_grads = optimizer.minimize(cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+
+    # all parameter server endpoints list for spliting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
+        trainer_prog = t.get_trainer_program()
+
+        feeding = {
+            'user_id': 0,
+            'gender_id': 1,
+            'age_id': 2,
+            'job_id': 3,
+            'movie_id': 4,
+            'category_id': 5,
+            'movie_title': 6,
+            'score': 7
+        }
+
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                outs = exe.run(trainer_prog,
+                               feed=func_feed(feeding, data, place),
+                               fetch_list=[cost])
+                out = np.array(outs[0])
+                print("cost=" + str(out[0]))
+                if out[0] < 6.0:
+                    print("Training complete. Average cost is less than 6.0.")
+                    # if avg cost less than 6.0, we think our code is good.
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
similarity index 62%
rename from python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
rename to python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
index 529223eba8af6d968b490068f34559880312515d..bff376a0e2ee0fbb0d869e0dddf4460ed5dc4ac6 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import os
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 
@@ -50,9 +51,9 @@ def stacked_lstm_net(data,
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_cost = fluid.layers.mean(x=cost)
     adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
+    optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
     accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
+    return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
 
 
 def to_lodtensor(data, place):
@@ -75,14 +76,14 @@ def main():
     PASS_NUM = 5
 
     word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
+    print "loaded word dict successfully"
     dict_dim = len(word_dict)
     class_dim = 2
 
     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = stacked_lstm_net(
+    cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net(
         data, label, input_dim=dict_dim, class_dim=class_dim)
 
     train_data = paddle.batch(
@@ -93,20 +94,41 @@ def main():
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
 
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_data():
-            cost_val, acc_val = exe.run(fluid.default_main_program(),
-                                        feed=feeder.feed(data),
-                                        fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                  " pass_acc=" + str(pass_acc))
-            if cost_val < 1.0 and acc_val > 0.8:
-                exit(0)
-    exit(1)
+    t = fluid.DistributeTranspiler()
+    # all parameter server endpoints list for spliting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
+        trainer_prog = t.get_trainer_program()
+        for pass_id in xrange(PASS_NUM):
+            accuracy.reset(exe)
+            for data in train_data():
+                cost_val, acc_val = exe.run(trainer_prog,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[cost, acc_out])
+                pass_acc = accuracy.eval(exe)
+                print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                      " pass_acc=" + str(pass_acc))
+                if cost_val < 1.0 and acc_val > 0.8:
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 7ad5e2c594f24999e298533b6c05ba688a935f0b..045db8390cd52689a2a803c3387c90776a44ee73 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -15,6 +15,8 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys
 
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -63,4 +65,6 @@ for pass_id in range(PASS_NUM):
 
         if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
+        if math.isnan(float(avg_loss_value)):
+            sys.exit("got NaN loss, training failed.")
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index 26673afd83c48328c3f354e82bfa3725aa4805b5..9fbb36d3638bd537020247d6f762afd4ed5d402f 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -18,6 +18,8 @@ import sys
 
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import math
+import sys
 
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
@@ -152,7 +154,10 @@ for pass_id in range(PASS_NUM):
         print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
             pass_acc))
         # this model is slow, so if we can train two mini batch, we think it works properly.
+
         if i > 2:
             exit(0)
+        if math.isnan(float(loss)):
+            sys.exit("got NaN loss, training failed.")
         i += 1
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index ffd53e7a78142162317a677de49c1821635a65b5..48abaa8d87563b7132c5d8962bc33283a104e67a 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -19,6 +19,8 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
+import math
+import sys
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
@@ -137,6 +139,8 @@ def main():
                   " avg_cost=" + str(avg_cost_val))
             if batch_id > 2:
                 exit(0)
+            if math.isnan(float(avg_cost_val)):
+                sys.exit("got NaN loss, training failed.")
             batch_id += 1
 
 
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
index 56f54de86f680653fbd97a7ce1d3f547d1657587..3f6d7070c2987d0557c60db84a2c679cd2cfe36b 100644
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
 
     def check_output(self, atol=1e-5):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
         for place in places:
             self.check_output_with_place(place, atol)
@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase):
                    max_relative_error=0.005,
                    user_defined_grads=None):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
         for place in places:
             self.check_grad_with_place(place, inputs_to_check, output_names,
diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
index 18605e60652a1614571a91918a012f0c08c8f1b3..1de5d446b8eaf57d3718dde7540c929996ee3432 100644
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -186,8 +186,7 @@ class TestFloor(OpTest):
         self.op_type = "floor"
         x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
         self.inputs = {'X': x}
-        # numpy floor need +1
-        self.outputs = {'Out': np.floor(self.inputs['X']) + 1.0}
+        self.outputs = {'Out': np.floor(self.inputs['X'])}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
index 86b0567ce123b00bace639fb8fe76cf3894abd6d..3556bcf8ba0d7f16b1d9bf50e46aebde83de2e25 100644
--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase):
 
     def test_sparse_adagrad(self):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
             self.check_with_place(place)
diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
index 10580adca714beeb7571312b8fdc4235ecaaccfe..df1fa8983c1984a9bb9f204aded148c17d3d609d 100644
--- a/python/paddle/v2/fluid/tests/test_adam_op.py
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
@@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase):
 
     def test_sparse_sgd(self):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
             self.check_with_place(place)
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
index 371bd426781b457582e74c33c80c46b5d56946fa..cf13166f255c782bdcec622d58d073a0943c8e1e 100644
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest):
             print "op test backward passed: ", str(place), data_layout
 
         places = [core.CPUPlace()]
-        if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
             places.append(core.CUDAPlace(0))
 
         for place in places:
diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
index 74138298978c7c18936f53761b313887f07aea81..4943bbb3388c3a476596b2fd4dd28605ee7be9e0 100644
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
     return match_indices, match_dist
 
 
-class TestBipartiteMatchOpForWithLoD(OpTest):
+class TestBipartiteMatchOpWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[0, 5, 11, 23]]
@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest):
         self.inputs = {'DistMat': (dist, lod)}
         self.outputs = {
             'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDis': (match_dist),
+            'ColToRowMatchDist': (match_dist),
         }
 
     def test_check_output(self):
@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
         self.inputs = {'DistMat': dist}
         self.outputs = {
             'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDis': match_dist,
+            'ColToRowMatchDist': match_dist,
         }
 
     def test_check_output(self):
diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc18476fd5dce7cd293f6cb85f419be7d88ec95
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py
@@ -0,0 +1,127 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+    prior_box_x = (
+        (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_y = (
+        (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_width = (
+        (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
+    prior_box_height = (
+        (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
+    prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
+                                          prior_box_var.shape[1])
+
+    if (code_type == "EncodeCenterSize"):
+        target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
+            target_box.shape[0], 1)
+        target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
+            target_box.shape[0], 1)
+
+        output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
+                prior_box_var[:,:,0]
+        output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
+                prior_box_var[:,:,1]
+        output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
+                prior_box_var[:,:,2]
+        output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
+                prior_box_var[:,:,3]
+
+    elif (code_type == "DecodeCenterSize"):
+        target_box = target_box.reshape(target_box.shape[0], 1,
+                                        target_box.shape[1])
+        target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
+                       prior_box_width + prior_box_x
+        target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
+                       prior_box_height + prior_box_y
+        target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
+                           prior_box_width
+        target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
+                            prior_box_height
+        output_box[:, :, 0] = target_box_x - target_box_width / 2
+        output_box[:, :, 1] = target_box_y - target_box_height / 2
+        output_box[:, :, 2] = target_box_x + target_box_width / 2
+        output_box[:, :, 3] = target_box_y + target_box_height / 2
+
+
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+    n = target_box.shape[0]
+    m = prior_box.shape[0]
+    output_box = np.zeros((n, m, 4), dtype=np.float32)
+    for i in range(len(lod) - 1):
+        box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var,
+                  output_box[lod[i]:lod[i + 1], :, :], code_type)
+    return output_box
+
+
+class TestBoxCoderOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 20]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.random.random((10, 4)).astype('float32')
+        target_box = np.random.random((20, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {'code_type': 'decode_center_size'}
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithLoD(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 4, 12, 20]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.random.random((10, 4)).astype('float32')
+        target_box = np.random.random((20, 4)).astype('float32')
+        code_type = "EncodeCenterSize"
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': (target_box, lod),
+        }
+        self.attrs = {'code_type': 'encode_center_size'}
+        self.outputs = {'OutputBox': output_box}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index 24de74d730eedbccb4837598bd6d2eb92da59e0d..7512ea333e37d5f4f0102531d8d13f8c2a744b8d 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1):
         self.op_type = "conv2d"
 
 
+class TestDepthwiseConv(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv2(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 #  cudnn v5 does not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
 #     def init_op_type(self):
diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/v2/fluid/tests/test_cpp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..970f57ed0008b0d7d99ad8b5de1cb7895239ed2c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cpp_reader.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import numpy as np
+
+prog = fluid.framework.Program()
+block = prog.current_block()
+
+random_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
+random_reader.desc.set_lod_levels([0, 0])
+
+create_random_data_generator_op = block.append_op(
+    type="create_random_data_generator",
+    outputs={"Out": random_reader},
+    attrs={
+        "shape_concat": [1, 2, 1, 1],
+        "ranks": [2, 2],
+        "min": 0.0,
+        "max": 1.0
+    })
+shuffle_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
+shuffle_reader.desc.set_lod_levels([0, 0])
+
+create_shuffle_reader_op = block.append_op(
+    type="create_shuffle_reader",
+    inputs={"UnderlyingReader": random_reader},
+    outputs={"Out": shuffle_reader},
+    attrs={"buffer_size": 7})
+
+batch_reader = block.create_var(
+    type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
+batch_reader.desc.set_lod_levels([1, 1])
+
+create_batch_reader_op = block.append_op(
+    type="create_batch_reader",
+    inputs={"UnderlyingReader": shuffle_reader},
+    outputs={"Out": batch_reader},
+    attrs={"batch_size": 10})
+
+out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
+out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
+
+read_op = block.append_op(
+    type="read", inputs={"Reader": batch_reader},
+    outputs={"Out": [out1, out2]})
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+[res1, res2] = exe.run(prog, fetch_list=[out1, out2], return_numpy=False)
+
+test_pass = res1.lod() == [range(0, 11)] and res1.lod() == [
+    range(0, 11)
+] and np.array(res1).shape == (10, 2) and np.array(res2).shape == (10, 1)
+
+if not test_pass:
+    exit(1)
+
+exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py
index 773c69d1ad0794d2e4edfb1f6f8140cbcd64bee6..cc815d8e9e16d36c4612009bd40414c454dc59fd 100644
--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
@@ -31,6 +31,8 @@ def CTCAlign(input, lod, blank, merge_repeated):
                 result.append(token)
             prev_token = token
     result = np.array(result).reshape([len(result), 1]).astype("int32")
+    if len(result) == 0:
+        result = np.array([-1])
     return result
 
 
@@ -72,5 +74,14 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
                 [19, 1]).astype("int32")
 
 
+class TestCTCAlignOpCase2(TestCTCAlignOp):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 4]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
index 107b9567dc4a8539532c2fff40df437cc72cc163..b0c55df9f58834688846c5362113464996eb286a 100644
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64)).astype('float32')
@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
             'Mask': np.zeros((32, 64)).astype('float32')
@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64, 2)).astype('float32')
@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
+        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
         self.outputs = {
             'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
         }
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31749df9baf10215fcd0cca3c1097f00c163ec7
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
@@ -0,0 +1,43 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwisePowOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestElementwisePowOp_scalar(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype('float32'),
+            'Y': np.random.rand(1).astype('float32')
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_fetch_var.py b/python/paddle/v2/fluid/tests/test_fetch_var.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed75a350b0bcb220c8435d60e1978c27da84a24c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fetch_var.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+
+
+class TestFetchVar(op_test.OpTest):
+    def test_fetch_var(self):
+        val = numpy.array([1, 3, 5]).astype(numpy.int32)
+        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
+        fetched_x = fluid.fetch_var("x")
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 82842534d4ac7ad8b0a8e0d877c6a638fb53cadc..79beb8b1fcef610bc2f3e8d18da4345baa9b99c3 100644
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase):
         self.gaussian_random_test(place=fluid.CPUPlace())
 
     def test_gpu(self):
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             self.gaussian_random_test(place=fluid.CUDAPlace(0))
 
     def gaussian_random_test(self, place):
diff --git a/python/paddle/v2/fluid/tests/test_label_smooth_op.py b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a4df57446c0c83b415909df3e0246bf2716881
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLabelSmoothOp(OpTest):
+    def config(self):
+        self.op_type = "label_smooth"
+        self.epsilon = 0.1
+        batch_size, self.label_dim = 5, 10
+        self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
+        nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
+        self.label[np.arange(batch_size), nonzero_index] = 1
+
+    def setUp(self):
+        self.config()
+        smoothed_label = (1 - self.epsilon
+                          ) * self.label + self.epsilon / self.label_dim
+        self.inputs = {'X': self.label}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+    def setUp(self):
+        self.config()
+        dist = np.random.random((1, self.label_dim))
+        smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist
+        self.inputs = {'X': self.label, 'PriorDist': dist}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..4460ffaf9c46966178497419a35ef4044464ac9f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -0,0 +1,251 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+from operator import mul
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.framework import grad_var_name
+
+np.random.random(123)
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape = [N, D]
+
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
+
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
+    x_shape = x.shape
+    scale_shape = scale.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+    scale.shape = [1, D]
+
+    # d_bias
+    d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
+    d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
+                     axis=0).reshape([1, D])
+    # dx
+    dx_end = scale * np.sqrt(1.0 / var) * grad_y
+    d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+        [N, 1])  # the second part equals to zero.
+    d_mean = 1.0 / D * d_mean_0
+    d_std = np.sum(
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+
+    grad_x = dx_end + d_mean + d_std
+
+    grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
+    scale.shape = scale_shape
+    var.shape, mean.shape = [N, ], [N, ]
+    return grad_x, d_scale, d_bias
+
+
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
+def create_or_get_tensor(scope, var_name, var, place):
+    tensor = scope.var(var_name).get_tensor()
+    if var is not None:
+        assert isinstance(var, np.ndarray)
+        tensor.set_lod([[]])
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+    return tensor
+
+
+def set_output_grad(scope, outputs, place, feed_dict=None):
+    def __set_tensor__(name, data=None):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if data is None:
+            if out_dtype == core.DataType.FP64:
+                data = np.ones(out_tensor.shape(), dtype=np.float64)
+            elif out_dtype == core.DataType.FP32:
+                data = np.ones(out_tensor.shape(), dtype=np.float32)
+            else:
+                raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
+
+    for output in outputs:
+        data = None
+        if output in feed_dict:
+            data = feed_dict[output]
+        __set_tensor__(output, data)
+
+
+class TestLayerNormdOp(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def __assert_grad_close(self,
+                            tensor,
+                            np_array,
+                            name,
+                            place,
+                            max_relative_error=0.02):
+        a = np.array(tensor)
+        b = np_array
+        abs_a = np.abs(a)
+        abs_a[abs_a < 1e-5] = 1
+
+        diff_mat = np.abs(a - b) / abs_a
+        max_diff = np.max(diff_mat)
+
+        def err_msg():
+            offset = np.argmax(diff_mat > max_relative_error)
+            return ("%s Variable %s max gradient diff %f over limit %f, "
+                    "the first error element is %d, %f, %f") % (
+                        "Gradient Check On %s" % str(place), name, max_diff,
+                        max_relative_error, offset, a.flatten()[offset],
+                        b.flatten()[offset])
+
+        self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def check_forward_backward(self, shape, begin_norm_axis):
+        def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
+            assert begin_norm_axis > 0 and begin_norm_axis < len(
+                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+            # attr
+            epsilon = 0.00001
+            x_shape = shape
+            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+            scale_shape = [D]
+
+            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
+
+            # run forward
+            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
+                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
+
+            # get gradient
+            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
+                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
+
+            scope = core.Scope()
+
+            # create input
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
+
+            # create output
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
+
+            layer_norm_op = Operator(
+                "layer_norm",
+                # inputs
+                X="X",
+                Scale="Scale",
+                Bias="Bias",
+                # outputs
+                Y="Y",
+                Mean="Mean",
+                Variance="Variance",
+                # attrs
+                epsilon=epsilon,
+                begin_norm_axis=begin_norm_axis)
+
+            layer_norm_op.run(scope, place)
+
+            # check forward result
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
+
+            # run backward
+            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
+            set_output_grad(
+                scope, ["Y", "Mean", "Variance"],
+                place,
+                feed_dict={"Y": y_grad})
+            layer_norm_op_grad.run(scope, place)
+
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = x_ = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
+
+            # check gradient output
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+
+    def test_check_forward_backward_with_scale(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward_with_bias(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward(self):
+        pass  # TODO(zcd)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 566fbba9abff36a2e1faccc8086bdabda0115d66..aea43c2517a02c72c1ee3307afdd3b21910f0064 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -202,6 +202,18 @@ class TestBook(unittest.TestCase):
                     x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
         print(str(program))
 
+    def test_dynamic_lstmp(self):
+        program = Program()
+        with program_guard(program):
+            hidden_dim, proj_dim = 16, 8
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
+            self.assertIsNotNone(
+                layers.dynamic_lstmp(
+                    input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
+        print(str(program))
+
     def test_sequence_softmax(self):
         program = Program()
         with program_guard(program):
@@ -211,6 +223,14 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.sequence_softmax(x=seq))
         print(str(program))
 
+    def test_softmax(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[10], dtype='float32')
+            hid = layers.fc(input=data, size=20)
+            self.assertIsNotNone(layers.softmax(x=hid))
+        print(str(program))
+
     def test_get_places(self):
         program = Program()
         with program_guard(program):
@@ -279,6 +299,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_multiplex(self):
+        program = Program()
+        with program_guard(program):
+            x1 = layers.data(name='x1', shape=[4], dtype='float32')
+            x2 = layers.data(name='x2', shape=[4], dtype='float32')
+            index = layers.data(name='index', shape=[1], dtype='int32')
+            out = layers.multiplex(inputs=[x1, x2], index=index)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6bab3d6c44b2b3403778d5db086e405bb30dee
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import math
+import copy
+
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.learning_rate_decay as lr_decay
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * decay_rate**exponent
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * math.exp(-1 * decay_rate * exponent)
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    temp = float(global_step) / float(decay_steps)
+    if staircase:
+        temp = math.floor(temp)
+    return learning_rate / (1 + decay_rate * temp)
+
+
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False):
+    if cycle:
+        div = math.ceil(global_step / float(decay_steps))
+        if div == 0:
+            div = 1
+        decay_steps = decay_steps * div
+    else:
+        global_step = min(global_step, decay_steps)
+    return (learning_rate - end_learning_rate) * \
+           ((1 - float(global_step) / float(decay_steps)) ** power) + end_learning_rate
+
+
+def piecewise_decay(global_step, boundaries, values):
+    assert len(boundaries) + 1 == len(values)
+    for i in range(len(boundaries)):
+        if global_step < boundaries[i]:
+            return values[i]
+    return values[len(values) - 1]
+
+
+class TestLearningRateDecay(unittest.TestCase):
+    def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
+        global_step = layers.create_global_var(
+            shape=[1], value=0.0, dtype='float32', persistable=True)
+
+        decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs)
+        layers.increment(global_step, 1.0)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        exe.run(fluid.default_startup_program())
+        for step in range(10):
+            step_val, lr_val = exe.run(fluid.default_main_program(),
+                                       feed=[],
+                                       fetch_list=[global_step, decayed_lr])
+            python_decayed_lr = python_decay_fn(global_step=step, **kwargs)
+            self.assertAlmostEqual(python_decayed_lr, lr_val[0])
+
+    def test_decay(self):
+        common_kwargs_true = {
+            "learning_rate": 1.0,
+            "decay_steps": 5,
+            "decay_rate": 0.5,
+            "staircase": True
+        }
+        common_kwargs_false = copy.deepcopy(common_kwargs_true)
+        common_kwargs_false["staircase"] = False
+
+        decay_fns = [
+            (exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
+            (exponential_decay, lr_decay.exponential_decay,
+             common_kwargs_false),
+            (natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
+            (natural_exp_decay, lr_decay.natural_exp_decay,
+             common_kwargs_false),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_true),
+            (inverse_time_decay, lr_decay.inverse_time_decay,
+             common_kwargs_false),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": True
+            }),
+            (polynomial_decay, lr_decay.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": False
+            }),
+            (piecewise_decay, lr_decay.piecewise_decay, {
+                "boundaries": [3, 6, 9],
+                "values": [0.1, 0.2, 0.3, 0.4]
+            }),
+        ]
+
+        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
+            print("decay_fn=" + py_decay_fn.__name__ + " kwargs=" + str(kwargs))
+            main_program = framework.Program()
+            startup_program = framework.Program()
+            with framework.program_guard(main_program, startup_program):
+                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
index d9fa01e247ae613fb2a7ed523a447e31a5bd5994..3e79f9d8e157bc744f14ecfa7c9a6d7de4eae1f9 100644
--- a/python/paddle/v2/fluid/tests/test_lstm_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstm_op.py
@@ -42,7 +42,7 @@ def relu(x):
     return np.maximum(x, 0)
 
 
-ACTVATION = {
+ACTIVATION = {
     'identity': identity,
     'sigmoid': sigmoid,
     'tanh': tanh,
@@ -158,8 +158,8 @@ class TestLstmOp(OpTest):
         w_b = b[:, 0:4 * self.D]
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
-                    ACTVATION[self.act_gate], ACTVATION[self.act_cell],
-                    ACTVATION[self.act_cand])
+                    ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                    ACTIVATION[self.act_cand])
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w}
 
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a954a9aa5574c3016cf9744e1765fff9e9c091
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -0,0 +1,286 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+import test_lstm_op as LstmTest
+
+ACTIVATION = {
+    'identity': LstmTest.identity,
+    'sigmoid': LstmTest.sigmoid,
+    'tanh': LstmTest.tanh,
+    'relu': LstmTest.relu
+}
+
+
+# LSTM with recurrent projection Layer
+def lstmp(
+        input,  # T x 4D
+        lod,  # 1 x N
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_r=None,  # P x 4D
+        w_rh=None,  # D x P
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None,
+        act_proj=None):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
+              act_proj):
+        g = np.dot(r_pre, w_r)  # 1 x 4D
+        g = g + x
+        g = np.reshape(g, (1, g.size))
+        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
+        if w_c is None:
+            g_i = act_gate(g_i)  # 1 x D
+            g_f = act_gate(g_f)  # 1 x D
+        else:
+            w_ic, w_fc, _ = np.split(w_c, 3, axis=1)
+            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
+            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
+        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
+
+        if w_c is None:
+            g_o = act_gate(g_o)  # 1 x D
+        else:
+            _, _, w_oc = np.split(w_c, 3, axis=1)
+            g_o = act_gate(g_o + w_oc * c)  # 1 x D
+        h = g_o * act_cell(c)
+        # projection
+        r = np.dot(h, w_rh)
+        r = act_proj(r)
+        return r, c
+
+    def _reverse(x, lod):
+        y = np.zeros_like(x)
+        for i in range(len(lod) - 1):
+            b, e = lod[i], lod[i + 1]
+            y[b:e, :] = np.flip(x[b:e, :], 0)
+        return y
+
+    offset = lod[0]
+    batch_size = len(offset) - 1
+    # recurrent projection state
+    projection = []
+    cell = []
+    input = _reverse(input, offset) if is_reverse else input
+    if w_b is not None:
+        input = input + np.tile(w_b, (offset[-1], 1))
+    for i in range(batch_size):
+        # compute one sequence
+        seq_len = offset[i + 1] - offset[i]
+        x = input[offset[i]:offset[i + 1], :]
+        r_pre = np.dot(h0[i], w_rh)  # 1 x P
+        r_pre = act_proj(r_pre)
+        c_pre = c0[i]  # 1 x D
+        for j in range(seq_len):
+            # compute one step
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
+                                 act_cell, act_cand, act_proj)
+            projection.append(r_pre.flatten())
+            cell.append(c_pre.flatten())
+
+    projection = np.array(projection).astype('float64')
+    cell = np.array(cell).astype('float64')
+
+    projection = _reverse(projection, offset) if is_reverse else projection
+    cell = _reverse(cell, offset) if is_reverse else cell
+
+    assert projection.shape == (input.shape[0], w_r.shape[0])  # T x P
+    assert cell.shape == (input.shape[0], input.shape[1] / 4)  # T x D
+    return projection, cell
+
+
+class TestLstmpOp(LstmTest.TestLstmOp):
+    def reset_argument(self):
+        pass
+
+    def setUp(self):
+        self.set_argument()
+        # projection size
+        self.P = 10
+        self.act_proj = self.act_cell
+
+        self.reset_argument()
+        self.op_type = 'lstmp'
+
+        T = self.lod[0][-1]
+        N = len(self.lod[0]) - 1
+
+        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
+        w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+
+        w_b = b[:, 0:4 * self.D]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+        w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
+                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
+
+        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
+
+        self.inputs['Bias'] = b
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Projection': (r, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand,
+            'proj_activation': self.act_proj
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2)
+
+
+class TestLstmpOpHasInitial(TestLstmpOp):
+    def reset_argument(self):
+        self.has_initial_state = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
+            ['Projection'],
+            max_relative_error=1e-2)
+
+    def test_check_grad_ingore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ingore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ingore_proj_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('ProjWeight'))
+
+    def test_check_grad_ingore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ingore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ingore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('C0'))
+
+
+class TestLstmpOpRerverse(TestLstmpOp):
+    def reset_argument(self):
+        self.is_reverse = True
+
+
+class TestLstmpOpNotUsePeepholes(TestLstmpOp):
+    def reset_argument(self):
+        self.use_peepholes = False
+
+
+class TestLstmpOpLinearProjection(TestLstmpOp):
+    def reset_argument(self):
+        self.act_proj = 'identity'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..c27573c3d69037bc48e0b6a90636b3f027f15a41
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
@@ -0,0 +1,100 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestMineHardExamplesOp(OpTest):
+    def set_data(self):
+        self.init_test_data()
+        self.inputs = {
+            'ClsLoss': self.cls_loss,
+            'LocLoss': self.loc_loss,
+            'MatchIndices': self.match_indices,
+            'MatchDist': self.match_dis
+        }
+
+        self.attrs = {
+            'neg_pos_ratio': self.neg_pos_ratio,
+            'neg_overlap': self.neg_overlap,
+            'sample_size': self.sample_size,
+            'mining_type': self.mining_type
+        }
+
+        self.outputs = {
+            'NegIndices': (self.neg_indices, self.neg_indices_lod),
+            'UpdatedMatchIndices': self.updated_match_indices
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        return
+
+    def setUp(self):
+        self.op_type = "mine_hard_examples"
+        self.set_data()
+
+    def init_test_data(self):
+        self.neg_pos_ratio = 1.0
+        self.neg_overlap = 0.5
+        self.sample_size = 0
+        self.mining_type = "max_negative"
+        self.cls_loss = np.array([[0.1, 0.1, 0.3],
+                                  [0.3, 0.1, 0.1]]).astype('float32')
+
+        self.loc_loss = np.array([[0.1, 0.2, 0.3],
+                                  [0.3, 0.4, 0.1]]).astype('float32')
+
+        self.match_dis = np.array([[0.2, 0.4, 0.8],
+                                   [0.1, 0.9, 0.3]]).astype('float32')
+
+        self.match_indices = np.array([[0, -1, -1],
+                                       [-1, 0, -1]]).astype('int32')
+
+        self.updated_match_indices = self.match_indices
+
+        self.neg_indices_lod = [[0, 1, 2]]
+        self.neg_indices = np.array([[1], [0]]).astype('int32')
+
+
+class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
+    def init_test_data(self):
+        super(TestMineHardExamplesOpHardExample, self).init_test_data()
+        self.mining_type = "hard_example"
+        self.sample_size = 2
+
+        self.cls_loss = np.array([[0.5, 0.1, 0.3],
+                                  [0.3, 0.1, 0.1]]).astype('float32')
+
+        self.loc_loss = np.array([[0.2, 0.2, 0.3],
+                                  [0.3, 0.1, 0.2]]).astype('float32')
+
+        self.match_indices = np.array([[0, -1, -1],
+                                       [-1, 0, -1]]).astype('int32')
+
+        self.updated_match_indices = np.array([[0, -1, -1],
+                                               [-1, -1, -1]]).astype('int32')
+
+        self.neg_indices_lod = [[0, 1, 3]]
+        self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b80d2359b083d30f9a5a7b8cc18aaf1ca5146c1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -0,0 +1,226 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+import copy
+from op_test import OpTest
+
+
+def iou(box_a, box_b):
+    """Apply intersection-over-union overlap between box_a and box_b
+    """
+    xmin_a = min(box_a[0], box_a[2])
+    ymin_a = min(box_a[1], box_a[3])
+    xmax_a = max(box_a[0], box_a[2])
+    ymax_a = max(box_a[1], box_a[3])
+
+    xmin_b = min(box_b[0], box_b[2])
+    ymin_b = min(box_b[1], box_b[3])
+    xmax_b = max(box_b[0], box_b[2])
+    ymax_b = max(box_b[1], box_b[3])
+
+    area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
+    area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
+    if area_a <= 0 and area_b <= 0:
+        return 0.0
+
+    xa = max(xmin_a, xmin_b)
+    ya = max(ymin_a, ymin_b)
+    xb = min(xmax_a, xmax_b)
+    yb = min(ymax_a, ymax_b)
+
+    inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
+
+    box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+
+    iou_ratio = inter_area / (area_a + area_b - inter_area)
+
+    return iou_ratio
+
+
+def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
+    """Apply non-maximum suppression at test time to avoid detecting too many
+    overlapping bounding boxes for a given object.
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class predscores for the img, Shape:[num_priors].
+        score_threshold: (float) The confidence thresh for filtering low
+            confidence boxes.
+        nms_threshold: (float) The overlap thresh for suppressing unnecessary
+            boxes.
+        top_k: (int) The maximum number of box preds to consider.
+        eta: (float) The parameter for adaptive NMS.
+    Return:
+        The indices of the kept boxes with respect to num_priors.
+    """
+    all_scores = copy.deepcopy(scores)
+    all_scores = all_scores.flatten()
+    selected_indices = np.argwhere(all_scores > score_threshold)
+    selected_indices = selected_indices.flatten()
+    all_scores = all_scores[selected_indices]
+
+    sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
+    sorted_scores = all_scores[sorted_indices]
+    if top_k > -1 and top_k < sorted_indices.shape[0]:
+        sorted_indices = sorted_indices[:top_k]
+        sorted_scores = sorted_scores[:top_k]
+
+    selected_indices = []
+    adaptive_threshold = nms_threshold
+    for i in range(sorted_scores.shape[0]):
+        idx = sorted_indices[i]
+        keep = True
+        for k in range(len(selected_indices)):
+            if keep:
+                kept_idx = selected_indices[k]
+                overlap = iou(boxes[idx], boxes[kept_idx])
+                keep = True if overlap <= adaptive_threshold else False
+            else:
+                break
+        if keep:
+            selected_indices.append(idx)
+        if keep and eta < 1 and adaptive_threshold > 0.5:
+            adaptive_threshold *= eta
+    return selected_indices
+
+
+def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
+                   nms_top_k, keep_top_k):
+    class_num = scores.shape[0]
+    priorbox_num = scores.shape[1]
+
+    selected_indices = {}
+    num_det = 0
+    for c in range(class_num):
+        if c == background: continue
+        indices = nms(boxes, scores[c], score_threshold, nms_threshold,
+                      nms_top_k)
+        selected_indices[c] = indices
+        num_det += len(indices)
+
+    if keep_top_k > -1 and num_det > keep_top_k:
+        score_index = []
+        for c, indices in selected_indices.iteritems():
+            for idx in indices:
+                score_index.append((scores[c][idx], c, idx))
+
+        sorted_score_index = sorted(
+            score_index, key=lambda tup: tup[0], reverse=True)
+        sorted_score_index = sorted_score_index[:keep_top_k]
+        selected_indices = {}
+
+        for _, c, _ in sorted_score_index:
+            selected_indices[c] = []
+        for s, c, idx in sorted_score_index:
+            selected_indices[c].append(idx)
+        num_det = keep_top_k
+
+    return selected_indices, num_det
+
+
+def batched_multiclass_nms(boxes, scores, background, score_threshold,
+                           nms_threshold, nms_top_k, keep_top_k):
+    batch_size = scores.shape[0]
+
+    det_outs = []
+    lod = [0]
+    for n in range(batch_size):
+        nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
+                                               score_threshold, nms_threshold,
+                                               nms_top_k, keep_top_k)
+        lod.append(lod[-1] + nmsed_num)
+        if nmsed_num == 0: continue
+
+        for c, indices in nmsed_outs.iteritems():
+            for idx in indices:
+                xmin, ymin, xmax, ymax = boxes[idx][:]
+                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+
+    return det_outs, lod
+
+
+class TestMulticlassNMSOp(OpTest):
+    def set_argument(self):
+        self.score_threshold = 0.01
+
+    def setUp(self):
+        self.set_argument()
+        N = 7
+        M = 1200
+        C = 21
+        BOX_SIZE = 4
+
+        background = 0
+        nms_threshold = 0.3
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = self.score_threshold
+
+        scores = np.random.random((N * M, C)).astype('float32')
+
+        def softmax(x):
+            shiftx = x - np.max(x).clip(-64.)
+            exps = np.exp(shiftx)
+            return exps / np.sum(exps)
+
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = np.reshape(scores, (N, M, C))
+        scores = np.transpose(scores, (0, 2, 1))
+
+        boxes = np.random.random((M, BOX_SIZE)).astype('float32')
+        boxes[:, 0:2] = boxes[:, 0:2] * 0.5
+        boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
+
+        nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
+                                                 score_threshold, nms_threshold,
+                                                 nms_top_k, keep_top_k)
+        nmsed_outs = [-1] if not nmsed_outs else nmsed_outs
+        nmsed_outs = np.array(nmsed_outs).astype('float32')
+
+        self.op_type = 'multiclass_nms'
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
+        self.outputs = {'Out': (nmsed_outs, [lod])}
+        self.attrs = {
+            'background_label': 0,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
+    def set_argument(self):
+        # Here set 2.0 to test the case there is no outputs.
+        # In practical use, 0.0 < score_threshold < 1.0 
+        self.score_threshold = 2.0
+
+
+class TestIOU(unittest.TestCase):
+    def test_iou(self):
+        box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
+        box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
+
+        expt_output = np.array([2.0 / 16.0]).astype('float32')
+        calc_output = np.array([iou(box1, box2)]).astype('float32')
+        self.assertTrue(np.allclose(calc_output, expt_output))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_multihead_attention.py b/python/paddle/v2/fluid/tests/test_multihead_attention.py
index 54ec3e3d6e53f35d6a518ef659853e1a13c1711f..a2b300a645fe21931cc12a4e7bb8ebe9b85707c9 100644
--- a/python/paddle/v2/fluid/tests/test_multihead_attention.py
+++ b/python/paddle/v2/fluid/tests/test_multihead_attention.py
@@ -58,7 +58,7 @@ class TestMultiheadAttention(unittest.TestCase):
         """Run the test program.
         """
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
 
         for place in places:
diff --git a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
index 57f14f6b9cc9c7cf9ae93274cf3d7763350e6e10..6b71f2a923f0cf0744d6b2190aa35830dcf15f24 100644
--- a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
@@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase):
         """Run the test program.
         """
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
 
         for place in places:
diff --git a/python/paddle/v2/fluid/tests/test_one_hot_op.py b/python/paddle/v2/fluid/tests/test_one_hot_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e51ea27d14d0637021f8902fa935beb318658018
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_one_hot_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import Program, program_guard
+
+
+class TestOneHotOp(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        depth = 10
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
+        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]),
+                              depth)).astype('float32')
+
+        for i in xrange(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth, 'dtype': int(core.DataType.FP32)}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestOneHotOp_default_dtype(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        depth = 10
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
+        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]),
+                              depth)).astype('float32')
+
+        for i in xrange(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestOneHotOp_exception(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        self.depth = 10
+        self.place = core.CPUPlace()
+        self.dimension = 12
+        self.x = core.LoDTensor()
+        x_lod = [[0, 4, 5, 8, 11]]
+        data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])]
+        data = np.array(data).astype('int').reshape([x_lod[0][-1], 1])
+        self.x.set(data, self.place)
+        self.x.set_lod(x_lod)
+
+    def test_check_output(self):
+        program = Program()
+        with program_guard(program):
+            x = fluid.layers.data(
+                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            block = program.current_block()
+            one_hot_out = block.create_var(
+                name="one_hot_out",
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                dtype='float32')
+            block.append_op(
+                type='one_hot',
+                inputs={'X': x},
+                attrs={'depth': self.depth},
+                outputs={'Out': one_hot_out})
+            exe = fluid.Executor(self.place)
+
+            def run():
+                exe.run(feed={'x': self.x},
+                        fetch_list=[one_hot_out],
+                        return_numpy=False)
+
+            self.assertRaises(core.EnforceNotMet, run)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
index 34939818126b1d747fb76861bbd691894fb3759b..7de02a8fda22a3db82a2e0b5e6fa9c9f2718fa12 100644
--- a/python/paddle/v2/fluid/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
@@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core
 
 class TestOpSupportGPU(unittest.TestCase):
     def test_case(self):
-        self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum"))
+        self.assertEqual(core.is_compiled_with_cuda(),
+                         core.op_support_gpu("sum"))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
index 480ee7091579ba171ca957cb4d25f0034e0534c0..dc6b84dcdc04dd185d97c3cc4b9f00305a911efb 100644
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -42,9 +42,9 @@ class TestOptimizer(unittest.TestCase):
             type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
         opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "sgd")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "sgd"])
 
     def test_sgd_optimizer_with_global_step(self):
         init_program = framework.Program()
@@ -72,11 +72,10 @@ class TestOptimizer(unittest.TestCase):
         sgd_optimizer = optimizer.SGDOptimizer(
             learning_rate=learning_rate, global_step=global_step)
         opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-        self.assertEqual(len(opts), 2)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "sgd")
-        increment_op = opts[1]
-        self.assertEqual(increment_op.type, "increment")
+        self.assertEqual(len(opts), 4)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "sgd", "increment"])
 
         # Check init_program
         init_ops = init_program.global_block().ops
@@ -121,9 +120,10 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "momentum")
+        self.assertEqual(len(opts), 3)
+        sgd_op = opts[-1]
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "momentum"])
         self.assertFalse(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
@@ -170,9 +170,10 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        sgd_op = opts[0]
-        self.assertEqual(sgd_op.type, "momentum")
+        self.assertEqual(len(opts), 3)
+        sgd_op = opts[-1]
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "momentum"])
         self.assertTrue(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
@@ -228,9 +229,9 @@ class TestAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
                                                           init_program)
-        self.assertEqual(len(opts), 1)
-        adagrad_op = opts[0]
-        self.assertEqual(adagrad_op.type, "adagrad")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "adagrad"])
 
         # Check accumulators
         accumulators = adagrad_optimizer.get_accumulators()
@@ -288,9 +289,10 @@ class TestAdamOptimizer(unittest.TestCase):
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
                                                        init_program)
-        self.assertEqual(len(opts), 3)
-        adam_op = opts[0]
-        self.assertEqual(adam_op.type, "adam")
+        self.assertEqual(len(opts), 5)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "adam", "scale", "scale"])
 
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
@@ -350,9 +352,10 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
                                                          init_program)
-        self.assertEqual(len(opts), 2)
-        adam_op = opts[0]
-        self.assertEqual(adam_op.type, "adamax")
+        self.assertEqual(len(opts), 4)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "adamax", "scale"])
 
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
@@ -409,9 +412,10 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
         opts = decayed_adagrad_optimizer.create_optimization_pass(
             params_grads, mul_out, init_program)
-        self.assertEqual(len(opts), 1)
-        decayed_adagrad_op = opts[0]
-        self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad")
+        self.assertEqual(len(opts), 3)
+        self.assertEqual(
+            [op.type for op in opts],
+            ["fill_constant", "elementwise_mul", "decayed_adagrad"])
 
         # Check accumulators
         accumulators = decayed_adagrad_optimizer.get_accumulators()
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
index 5394df7360e5ba60408b7f53494c3e0152c424a7..367cc8b1aaf0aff24c685031f33d35becb9eb7ef 100644
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase):
             fetch=fetch,
             place=cpu,
             use_parallel=True)
-        if fluid.core.is_compile_gpu():
+        if fluid.core.is_compiled_with_cuda():
             gpu = fluid.CUDAPlace(0)
             result_gpu = self._run_test_impl_(
                 callback=callback,
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
index 34700df37d22cf71bad2d86efa4718a3767c2d4f..09b2d08401878448b4b3f3c6c03193e255e9ffeb 100644
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -23,7 +23,7 @@ import paddle.v2.fluid.core as core
 
 class TestProfiler(unittest.TestCase):
     def test_nvprof(self):
-        if not fluid.core.is_compile_gpu():
+        if not fluid.core.is_compiled_with_cuda():
             return
         epoc = 8
         dshape = [4, 3, 28, 28]
@@ -42,7 +42,7 @@ class TestProfiler(unittest.TestCase):
         os.remove(output_file)
 
     def net_profiler(self, state):
-        if state == 'GPU' and not core.is_compile_gpu():
+        if state == 'GPU' and not core.is_compiled_with_cuda():
             return
         startup_program = fluid.Program()
         main_program = fluid.Program()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 9034b2f4ef1c983ef224b14b8f602f87e6ce94b0..c590bf1c6570a2320962f2d610619dbd88b473d1 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -115,6 +115,17 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
+    def test_multiple_shape(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
+        var.set_shapes(src_shapes)
+        res_shapes = var.shapes()
+        self.assertEqual(src_shapes, res_shapes)
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
     def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
@@ -124,6 +135,28 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
+    def test_multiple_dtype(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_types = [
+            core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
+        ]
+        var.set_dtypes(src_types)
+        self.assertEqual(src_types, var.dtypes())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
+    def test_multiple_lod_level(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        src_types = [3, 1, 2]
+        var.set_lod_levels(src_types)
+        self.assertEqual(src_types, var.lod_levels())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):
diff --git a/python/paddle/v2/fluid/tests/test_recv_op.py b/python/paddle/v2/fluid/tests/test_recv_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a02b882410fe896cd2add03060127a01cbdaa38
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_recv_op.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import numpy
+from multiprocessing import Process
+import os, sys
+import time
+
+
+class TestRecvOp(unittest.TestCase):
+    def test_send(self):
+        # Run init_serv in a thread
+        place = fluid.CPUPlace()
+        p = Process(target=self.init_serv, args=(place, ))
+        p.daemon = True
+        p.start()
+        time.sleep(1)
+        self.init_client(place)
+        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
+        os.system("kill -9 %d" % p.pid)
+        p.join()
+
+    def init_serv(self, place):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name="X",
+                append_batch_size=False)
+            fluid.initializer.Constant(value=1.0)(x, main.global_block())
+            serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
+            with serv.do():
+                o = layers.scale(x=x, scale=10.0)
+            main.global_block().create_var(
+                name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
+        exe = fluid.Executor(place)
+        exe.run(main)
+
+    def init_client(self, place):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name='X',
+                append_batch_size=False)
+            fluid.initializer.Constant(value=1.0)(x, main.global_block())
+            layers.Send("127.0.0.1:6174", [x], [x])
+        exe = fluid.Executor(place)
+        exe.run(main)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
index 74cd6de9e6fde70c001bb2189c4976cdd8e34633..0a223bac0ce8fd626881cef983c7cd960f2c5ba8 100644
--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase):
         outputs = []
         input_grads = []
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
             self.set_inputs(place)
diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
index f87927968b0fdb00ec207ff1d52be9e0d81af139..ba2ca1683f9f6d72bbd1550df89c7424d223a1d9 100644
--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase):
 
     def test_sparse_sgd(self):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
             self.check_with_place(place)
diff --git a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
index 37c6587c4151a89563f93cab35d63b2419ef88ab..343aa20066146ae08462a92f1efaa20c4d4b5ed8 100644
--- a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
@@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator
 class TestSpliteSelectedRows(unittest.TestCase):
     def get_places(self):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         return places
 
diff --git a/python/paddle/v2/fluid/tests/test_switch.py b/python/paddle/v2/fluid/tests/test_switch.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ebf773ec72226aae5efb635e070baa8a123595
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_switch.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import default_startup_program
+
+
+class TestSwitch(unittest.TestCase):
+    def check_switch(self, value):
+        x = layers.fill_constant(shape=[1], dtype='float32', value=value)
+
+        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
+        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
+        two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
+        three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)
+
+        result = layers.create_global_var(
+            shape=[1], value=-1.0, dtype='float32', persistable=True)
+
+        with layers.Switch() as switch:
+            with switch.case(layers.less_than(x, zero_var)):
+                layers.assign(zero_var, result)
+            with switch.case(layers.less_than(x, one_var)):
+                layers.assign(one_var, result)
+            with switch.case(layers.less_than(x, two_var)):
+                layers.assign(two_var, result)
+            with switch.default():
+                layers.assign(three_var, result)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(default_startup_program())
+
+        out = exe.run(feed={}, fetch_list=[result])[0][0]
+        return out
+
+    def test_switch(self):
+        test_data = {(-0.1, 0), (0.1, 1), (1.1, 2), (2.1, 3)}
+        for x, expected_result in test_data:
+            main_program = framework.Program()
+            startup_program = framework.Program()
+            with framework.program_guard(main_program, startup_program):
+                result = self.check_switch(x)
+                self.assertEqual(result, expected_result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_target_assign_op.py b/python/paddle/v2/fluid/tests/test_target_assign_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..8a1155c6217401b1b85e3c0bdc47f438f482bcbb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_target_assign_op.py
@@ -0,0 +1,122 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+from op_test import OpTest
+
+
+def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
+    if len(gt_lod) != len(neg_lod):
+        raise AssertionError("The input arguments are illegal.")
+
+    batch_size = len(gt_lod) - 1
+
+    match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
+    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+
+    for n in range(batch_size):
+        gt_num = gt_lod[n + 1] - gt_lod[n]
+        ids = random.sample([i for i in range(num_prior)], gt_num)
+        match_indices[n, ids] = [i for i in range(gt_num)]
+
+        ret_ids = set([i for i in range(num_prior)]) - set(ids)
+        s = neg_lod[n]
+        e = neg_lod[n + 1]
+        l = e - s
+        neg_ids = random.sample(ret_ids, l)
+        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+
+    return match_indices, neg_indices
+
+
+def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
+                  neg_lod, background_label):
+    batch_size, num_prior = match_indices.shape
+
+    # init target bbox
+    trg_box = np.zeros((batch_size, num_prior, 4)).astype('float32')
+    # init weight for target bbox
+    trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+    # init target label
+    trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
+    trg_label = trg_label * background_label
+    # init weight for target label
+    trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
+
+    for i in range(batch_size):
+        cur_indices = match_indices[i]
+        col_ids = np.where(cur_indices > -1)
+        col_val = cur_indices[col_ids]
+
+        gt_start = gt_lod[i]
+        # target bbox
+        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+            trg_box[i][c][:] = encoded_box[v][c][:]
+
+        # weight for target bbox
+        trg_box_wt[i][col_ids] = 1.0
+
+        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+
+        trg_label_wt[i][col_ids] = 1.0
+        # set target label weight to 1.0 for the negative samples
+        neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+        trg_label_wt[i][neg_ids] = 1.0
+
+    return trg_box, trg_box_wt, trg_label, trg_label_wt
+
+
+class TestTargetAssginOp(OpTest):
+    def setUp(self):
+        self.op_type = "target_assign"
+
+        num_prior = 120
+        num_class = 21
+        gt_lod = [0, 5, 11, 23]
+        neg_lod = [0, 4, 7, 13]
+        batch_size = len(gt_lod) - 1
+        num_gt = gt_lod[-1]
+        background_label = 0
+
+        encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
+        gt_label = np.random.randint(
+            num_class, size=(num_gt, 1)).astype('int32')
+        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
+                                                               gt_lod, neg_lod)
+        trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
+            encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
+            background_label)
+
+        self.inputs = {
+            'EncodedGTBBox': (encoded_box, [gt_lod]),
+            'GTScoreLabel': (gt_label, [gt_lod]),
+            'MatchIndices': (match_indices),
+            'NegIndices': (neg_indices, [neg_lod]),
+        }
+        self.attrs = {'background_label': background_label}
+        self.outputs = {
+            'PredBBoxLabel': (trg_box),
+            'PredBBoxWeight': (trg_box_wt),
+            'PredScoreLabel': (trg_label),
+            'PredScoreWeight': (trg_label_wt),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
index d5cc235f588ad37b0d1293dc9894952c97411757..0219bef42b3ba133dda7412c1036cf989a170a36 100644
--- a/python/paddle/v2/fluid/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase):
         scope = core.Scope()
         place = core.CPUPlace()
         lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor = core.LoDTensor(lod_py)
+        lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
+        lod_tensor.alloc_float(place)
+        tensor_array = numpy.array(lod_tensor)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
+
+        lod_v = numpy.array(lod_tensor)
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertListEqual(lod_py, lod_tensor.lod())
+
+    def test_lod_tensor_gpu_init(self):
+        if not core.is_compiled_with_cuda():
+            return
+        scope = core.Scope()
+        place = core.CUDAPlace(0)
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_tensor = core.LoDTensor()
+
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
index b2a39f975eb461292dc2e7be332a26931684bf90..94cf416fad8f02cdea8017ae1350fa264ce644b1 100644
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase):
         self.uniform_random_test(place=core.CPUPlace())
 
     def test_gpu(self):
-        if core.is_compile_gpu():
+        if core.is_compiled_with_cuda():
             self.uniform_random_test(place=core.CUDAPlace(0))
 
     def uniform_random_test(self, place):
diff --git a/python/paddle/v2/fluid/tests/test_weight_normalization.py b/python/paddle/v2/fluid/tests/test_weight_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ad8285d8a3c2ced814cc3588a814c14ec60855
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_weight_normalization.py
@@ -0,0 +1,121 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+import collections
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.param_attr import WeightNormParamAttr
+
+
+class TestWeightNormalization(unittest.TestCase):
+    batch_size = 3
+    hidden_size = 5
+    data_desc = (['x', [10], 0], )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.set_program()
+
+    @classmethod
+    def set_program(cls):
+        data = fluid.layers.data(
+            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        out = fluid.layers.fc(input=data,
+                              size=cls.hidden_size,
+                              param_attr=WeightNormParamAttr(
+                                  dim=None,
+                                  name='weight_norm_param',
+                                  initializer=ConstantInitializer(1.0)),
+                              bias_attr=False,
+                              act=None)
+        loss = fluid.layers.reduce_sum(out)
+        fluid.backward.append_backward(loss=loss)
+        cls.fetch_list = [
+            'weight_norm_param_g', 'weight_norm_param_v',
+            'weight_norm_param_g@GRAD'
+        ]
+
+    def run_program(self):
+        outputs = []
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=False)
+            outputs.append(output)
+        self.actual_outputs = outputs
+
+    def set_data(self):
+        self.data = collections.OrderedDict()
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_lod_level = desc[2]
+            data_lod = []
+            for i in range(data_lod_level):
+                lod_level_i = numpy.random.randint(
+                    low=1,
+                    high=5,
+                    size=self.batch_size if i == 0 else lod_level_i[-1])
+                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                data_lod.append(lod_level_i)
+            data_value = numpy.random.random(
+                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                      ] + data_shape).astype('float32')
+            self.data[data_name] = (data_value, data_lod)
+
+    def set_inputs(self, place):
+        self.inputs = {}
+        for desc in self.data_desc:
+            tensor = fluid.Tensor()
+            tensor.set(self.data[desc[0]][0], place)
+            if self.data[desc[0]][1]:
+                tensor.set_lod(self.data[desc[0]][1])
+            self.inputs[desc[0]] = tensor
+
+    def weight_normalize(self):
+        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
+                        self.hidden_size))
+        g = numpy.linalg.norm(v, axis=None, keepdims=True)
+        w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
+        x = self.data[self.data_desc[0][0]][0]
+        out = numpy.dot(x, w)
+        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
+            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
+        return g, v, g_grad
+
+    def test_weight_normalization(self):
+        self.set_data()
+        self.run_program()
+        expect_output = self.weight_normalize()
+        for actual_output in self.actual_outputs:
+            [
+                self.assertTrue(
+                    numpy.allclose(
+                        numpy.array(actual), expect, atol=0.001))
+                for expect, actual in zip(expect_output, actual_output)
+            ]
+
+
+if __name__ == '__main__':
+    unittest.main()