Merge branch 'develop' into inference_lib_dist

446198da · Luo Tao · 55b5f29e · be815dd0 · 446198da · 446198da
176 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11

--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+# The following instructions call `pip`; the `python` package alone does not
+# provide it on this Ubuntu base image, so install python-pip explicitly.
+RUN apt-get update && apt-get install -y python python-pip
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
+# Performance for Distributed vgg16
+## Test Result
+### Hardware Information
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+### Single Node Single Thread
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+### Different Batch Size
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+### Accelerate Rate
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+### Different Pserver Count
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+*The performance gap between Fluid and v2 comes from the network interference.*
+## Steps to Run the Performance Test
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+Check the logs for the distributed training progress and analyze the performance.
+## Enable Verbose Logs
+Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+# Command-line interface of the benchmark. The module docstring is reused as
+# the parser description.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='CPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+# Parsed with str2bool, so values such as "0"/"1" or "true"/"false" work.
+parser.add_argument(
+    '--local',
+    type=str2bool,
+    default=True,
+    help='Whether to run as local mode.')
+# Parsing happens at module level, so the command line is consumed as soon as
+# this file is executed or imported.
+args = parser.parse_args()
+def vgg16_bn_drop(input):
+    """Build the VGG16 feature network with batch norm and dropout.
+
+    Stacks five convolution groups (64, 128, 256, 512 and 512 filters), then
+    dropout -> fc(512) -> batch_norm(relu) -> dropout -> fc(512).
+
+    Args:
+        input: the layer/variable the first convolution group consumes.
+
+    Returns:
+        The output of the second 512-wide fully connected layer.
+    """
+
+    def conv_block(input, num_filter, groups, dropouts):
+        # One group: `groups` 3x3 relu convolutions with batch norm, each with
+        # its own dropout rate from `dropouts`, followed by 2x2 max pooling
+        # with stride 2.
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+def main():
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    # Evaluator
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+        args.device_id)
+    exe = fluid.Executor(place)
+    # test
+    def test(exe):
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+            exe.run(inference_program,
+                    feed={"pixel": img_data,
+                          "label": y_data})
+        return accuracy.eval(exe)
+    def train_loop(exe, trainer_prog):
+        iters = 0
+        ts = time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            start_time = time.time()
+            num_samples = 0
+            accuracy.reset(exe)
+            with profiler.profiler("CPU", 'total') as prof:
+                for batch_id, data in enumerate(train_reader()):
+                    ts = time.time()
+                    img_data = np.array(
+                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                            "float32")
+                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                    y_data = y_data.reshape([-1, 1])
+                    loss, acc = exe.run(
+                        trainer_prog,
+                        feed={"pixel": img_data,
+                              "label": y_data},
+                        fetch_list=[avg_cost] + accuracy.metrics)
+                    iters += 1
+                    num_samples += len(data)
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                        % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is the accumulation of batches, but not the current batch.
+            pass_elapsed = time.time() - start_time
+            pass_train_acc = accuracy.eval(exe)
+            pass_test_acc = test(exe)
+            print(
+                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                   pass_test_acc))
+    if args.local:
+        # Parameter initialization
+        exe.run(fluid.default_startup_program())
+        # data reader
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                else paddle.dataset.flowers.train(),
+                buf_size=5120),
+            batch_size=args.batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            batch_size=args.batch_size)
+        train_loop(exe, fluid.default_main_program())
+    else:
+        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, "6174"]))
+        pserver_endpoints = ",".join(eplist)
+        print("pserver endpoints: ", pserver_endpoints)
+        trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
+        current_endpoint = os.getenv(
+            "POD_IP") + ":6174"  # current pserver endpoint
+        training_role = os.getenv(
+            "TRAINING_ROLE",
+            "TRAINER")  # get the training role: trainer/pserver
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            optimize_ops,
+            params_grads,
+            pservers=pserver_endpoints,
+            trainers=trainers)
+        if training_role == "PSERVER":
+            if not current_endpoint:
+                print("need env SERVER_ENDPOINT")
+                exit(1)
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            print("starting server side startup")
+            exe.run(pserver_startup)
+            print("starting parameter server...")
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            # Parameter initialization
+            exe.run(fluid.default_startup_program())
+            # data reader
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                    else paddle.dataset.flowers.train(),
+                    buf_size=5120),
+                batch_size=args.batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+                paddle.dataset.flowers.test(),
+                batch_size=args.batch_size)
+            trainer_prog = t.get_trainer_program()
+            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+            exe.run(fluid.default_startup_program())
+            train_loop(exe, trainer_prog)
+        else:
+            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+if __name__ == "__main__":
+    print_arguments()
+    main()
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import gzip
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+# Size of the dense input vector fed to the "image" data layer.
+DATA_DIM = 3 * 32 * 32
+# Number of label classes.
+CLASS_DIM = 10
+# Batch size comes from the BATCH_SIZE environment variable and falls back to
+# 128 when it is unset or empty.
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+# Total trainer count; TRAINERS must be set, otherwise int(None) raises
+# TypeError at import time.
+NODE_COUNT = int(os.getenv("TRAINERS"))
+# Start time of the current iteration; updated by the event handler in main().
+ts = 0
+def vgg(input, nums, class_dim):
+    """Build a VGG-style classifier with five convolution groups.
+
+    Args:
+        input: the input layer; its channel count is passed as 3 to the
+            first convolution group.
+        nums: list of exactly five ints, the number of convolutions in each
+            group.
+        class_dim: size of the final softmax layer.
+
+    Returns:
+        The softmax output layer.
+    """
+
+    def conv_block(input, num_filter, groups, num_channels=None):
+        # One group: `groups` 3x3 relu convolutions followed by 2x2 max
+        # pooling with stride 2.
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+    # Two 512-wide relu fully connected layers, each with drop_rate=0.5,
+    # precede the softmax classifier.
+    fc_dim = 512
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+def main():
+    """Train VGG16 with the v2 API using a non-local (is_local=False) trainer.
+
+    Builds the network and a Momentum optimizer, creates shuffled training and
+    plain test readers from the cifar dataset module, then trains for 200
+    passes while printing per-batch cost/timing and per-pass test metrics.
+    """
+    global ts
+    paddle.init(use_gpu=False)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+    extra_layers = None
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+    # Create optimizer
+    # The learning rate is divided by BATCH_SIZE and the L2 rate multiplied by
+    # it. NOTE(review): presumably this compensates for per-batch gradient
+    # summation - confirm.
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar.train10(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        cifar.test10(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+    # End batch and end pass event handler
+    def event_handler(event):
+        # `ts` and `ts_pass` are module globals holding the start time of the
+        # current iteration and pass; `ts_pass` is first created here on the
+        # first BeginPass event.
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            # `% 1` is always 0, so every batch is logged; raise the modulus
+            # to log less often.
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
+        if isinstance(event, paddle.event.EndPass):
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
+            # Evaluate on the test reader at the end of every pass.
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+if __name__ == '__main__':
+    main()
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 include_directories(${BOOST_INCLUDE_DIR})

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
    endif()
    if (cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)
@@ -457,12 +468,12 @@ endfunction()
 function(py_test TARGET_NAME)
  if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
    :noindex:
+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
 Norm Layer
 ==========
@@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
    :noindex:
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
 Recurrent Layer Group
 =====================
@@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:
+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 Sampling Layers
 ===============
@@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:
-Factorization Machine Layer
-============================
-factorization_machine
---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-Slicing and Joining Layers
-==========================
-pad
----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
 ..  _api_v2.layer_costs:
 Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
    :noindex:
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
 Check Layer
 ============
@@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:
-Miscs
+Activation
-=====
+==========
-dropout
--------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-Activation with learnable parameter
-===================================
 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
-gated_unit
-----------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-Detection output Layer
-======================
-detection_output
----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
+wmt16
++++++
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-DataFeeder
+data_feeder
 ===========
 DataFeeder
-----------
+----------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
    :noindex:
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Evaluator
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+=========
-Evaluator
+evaluator
-----------
+=========
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+Accuracy
+--------
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
    :noindex:
+ChunkEvaluator
+--------------
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Executor
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+========
+executor
+========
 Executor
+--------
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+global_scope
+------------
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+switch_scope
+------------
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
    :noindex:
--- a/doc/api/v2/fluid/gen_doc.py
+++ b/doc/api/v2/fluid/gen_doc.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import sys
+import types
+import paddle.v2.fluid as fluid
+def parse_arg():
+    """Parse the generator's command line and return the argparse namespace.
+    The namespace carries `module` (required positional) and `submodules`
+    (list given after --submodules, or None when the flag is absent).
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    cli.add_argument('--submodules', nargs="*")
+    parsed = cli.parse_args()
+    return parsed
+class DocGenerator(object):
+    """Writes a Sphinx reST API page for one attribute of `paddle.v2.fluid`.
+    Constructing the generator immediately writes the "generated file" banner
+    and the page title to `stream`; each print_* method then appends one
+    section or one autodoc entry.
+    """
+    def __init__(self, module_name, stream=sys.stdout):
+        """Resolve `fluid.<module_name>` and write the page preamble.
+        Raises ValueError when `fluid` has no attribute named `module_name`.
+        """
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+''')
+        self._print_header_(module_name, dot='=', is_title=True)
+    def print_submodule(self, submodule_name):
+        """Write a section header for a submodule, then every name in its `__all__`.
+        Raises ValueError when the module has no attribute `submodule_name`.
+        """
+        # The default is required: a two-argument getattr raises AttributeError
+        # for a missing name, which made the None check below unreachable.
+        submodule = getattr(self.module, submodule_name, None)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+        for item in submodule.__all__:
+            self.print_item(item)
+    def print_current_module(self):
+        """Write an entry for every name listed in the module's own `__all__`."""
+        for item in self.module.__all__:
+            self.print_item(item)
+    def print_section(self, name):
+        """Write `name` as a section header underlined with '='."""
+        self._print_header_(name, dot='=', is_title=False)
+    def print_item(self, name):
+        """Write the autodoc entry for `name`, looked up on the top-level module.
+        Classes get an autoclass entry and plain functions an autofunction
+        entry; anything else raises RuntimeError.
+        """
+        item = getattr(self.module, name)
+        # `type` is the very object Python 2 exposes as types.TypeType, and it
+        # also exists on Python 3 where types.TypeType was removed.
+        if isinstance(item, type):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+    def print_class(self, name):
+        """Write a header and an `autoclass` directive (with :members:) for `name`."""
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+'''.format(self.module_name, name))
+    def print_method(self, name):
+        """Write a header and an `autofunction` directive for `name`."""
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+'''.format(self.module_name, name))
+    def _print_header_(self, name, dot, is_title):
+        """Write `name` underlined by `dot` repeated to the same length.
+        When `is_title` is true the same line is also written above the name.
+        """
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+def main():
+    """Generate the reST page for the module named on the command line.
+    When --submodules is given, each listed submodule gets its own section;
+    otherwise the module's own `__all__` is documented directly.
+    """
+    cli_args = parse_arg()
+    generator = DocGenerator(cli_args.module)
+    requested = cli_args.submodules
+    if requested is not None:
+        for each_name in requested:
+            generator.print_submodule(each_name)
+    else:
+        generator.print_current_module()
+if __name__ == '__main__':
+    main()
--- a/doc/api/v2/fluid/gen_doc.sh
+++ b/doc/api/v2/fluid/gen_doc.sh
+#!/bin/bash
+# Regenerates the fluid API .rst pages by redirecting gen_doc.py's output.
+# gen_doc.py and the output files are addressed by relative path, so run this
+# from the directory that contains gen_doc.py.
+# layers is split into one section per submodule.
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+# Every other module is documented from its own __all__; each name is listed
+# once so that no page is generated twice.
+for module in data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+  python gen_doc.py "${module}" > "${module}.rst"
+done
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-Initializer
+initializer
 ===========
+Constant
+--------
+..  autoclass:: paddle.v2.fluid.initializer.Constant
-Initializer
+    :members:
-----------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: Initializer
-    :noindex:
-ConstantInitializer
-------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: ConstantInitializer
    :noindex:
+Uniform
+-------
+..  autoclass:: paddle.v2.fluid.initializer.Uniform
-UniformInitializer
+    :members:
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: UniformInitializer
-    :noindex:
-NormalInitializer
-----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: NormalInitializer
    :noindex:
+Normal
+------
-XavierInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Normal
-----------------
+    :members:
-..  automodule:: paddle.v2.fluid.initializer
-    :members: XavierInitializer
    :noindex:
+Xavier
+------
-MSRAInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Xavier
---------------
+    :members:
-..  automodule:: paddle.v2.fluid.initializer
-    :members: MSRAInitializer
    :noindex:
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-IO
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+==
+io
+==
+save_vars
+---------
-is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_vars
+    :noindex:
+save_params
 -----------
-..  autofunction:: paddle.v2.fluid.io.is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_params
+    :noindex:
+save_persistables
+-----------------
+..  autofunction:: paddle.v2.fluid.io.save_persistables
+    :noindex:
+load_vars
+---------
+..  autofunction:: paddle.v2.fluid.io.load_vars
+    :noindex:
+load_params
+-----------
+..  autofunction:: paddle.v2.fluid.io.load_params
    :noindex:
+load_persistables
+-----------------
+..  autofunction:: paddle.v2.fluid.io.load_persistables
+    :noindex:
+save_inference_model
+--------------------
+..  autofunction:: paddle.v2.fluid.io.save_inference_model
+    :noindex:
+load_inference_model
+--------------------
+..  autofunction:: paddle.v2.fluid.io.load_inference_model
+    :noindex:
+get_inference_program
+---------------------
+..  autofunction:: paddle.v2.fluid.io.get_inference_program
+    :noindex:
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
-==========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Layers
+    !DO NOT EDIT THIS FILE MANUALLY!
-==========
+======
+layers
+======
-fc
+control_flow
---
+============
-..  autofunction:: paddle.v2.fluid.layers.fc
+split_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:
-embedding
+merge_lod_tensor
---------
+----------------
-..  autofunction:: paddle.v2.fluid.layers.embedding
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
    :noindex:
-dynamic_lstm
+BlockGuard
------------
+----------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+    :members:
    :noindex:
-dynamic_lstmp
+BlockGuardWithCompletion
-------------
+------------------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+    :members:
    :noindex:
-dynamic_gru
+StaticRNNMemoryLink
-----------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+    :members:
    :noindex:
-data
+WhileGuard
----
+----------
-..  autofunction:: paddle.v2.fluid.layers.data
+..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+    :members:
    :noindex:
-mean
+While
----
+-----
-..  autofunction:: paddle.v2.fluid.layers.mean
+..  autoclass:: paddle.v2.fluid.layers.While
+    :members:
    :noindex:
-mul
+lod_rank_table
---
+--------------
-..  autofunction:: paddle.v2.fluid.layers.mul
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:
-elementwise_add
+max_sequence_len
---------------
+----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:
-elementwise_sub
+topk
---------------
+----
-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:
-elementwise_mul
+lod_tensor_to_array
---------------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:
-elementwise_div
+array_to_lod_tensor
---------------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:
+increment
+---------
-dropout
+..  autofunction:: paddle.v2.fluid.layers.increment
-------
-..  autofunction:: paddle.v2.fluid.layers.dropout
    :noindex:
+array_write
+-----------
-reshape
+..  autofunction:: paddle.v2.fluid.layers.array_write
--------
-..  autofunction:: paddle.v2.fluid.layers.reshape
    :noindex:
+create_array
+------------
-sigmoid
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+less_than
 ---------
-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+..  autofunction:: paddle.v2.fluid.layers.less_than
    :noindex:
+array_read
+----------
-scale
+..  autofunction:: paddle.v2.fluid.layers.array_read
---------
+    :noindex:
-..  autofunction:: paddle.v2.fluid.layers.scale
+shrink_memory
+-------------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
    :noindex:
+array_length
+------------
-transpose
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+IfElse
+------
+..  autoclass:: paddle.v2.fluid.layers.IfElse
+    :members:
+    :noindex:
+DynamicRNN
+----------
+..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+ConditionalBlock
+----------------
+..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+StaticRNN
 ---------
-..  autofunction:: paddle.v2.fluid.layers.transpose
+..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+    :members:
    :noindex:
+reorder_lod_tensor_by_rank
+--------------------------
-sigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
---------------------------------
-..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
    :noindex:
+ParallelDo
+----------
-cast
+..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+Print
+-----
+..  autofunction:: paddle.v2.fluid.layers.Print
+    :noindex:
+device
+======
+get_places
+----------
+..  autofunction:: paddle.v2.fluid.layers.get_places
+    :noindex:
+io
+==
+data
 ----
-..  autofunction:: paddle.v2.fluid.layers.cast
+..  autofunction:: paddle.v2.fluid.layers.data
    :noindex:
+BlockGuardServ
+--------------
-concat
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
-------
+    :members:
-..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:
+ListenAndServ
+-------------
-sums
+..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+Send
 ----
-..  autofunction:: paddle.v2.fluid.layers.sums
+..  autofunction:: paddle.v2.fluid.layers.Send
    :noindex:
+nn
+==
-linear_chain_crf
+fc
----------------
+--
-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+..  autofunction:: paddle.v2.fluid.layers.fc
    :noindex:
+embedding
+---------
-assign
-------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
    :noindex:
+dynamic_lstm
+------------
-split_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
----------------
-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:
+dynamic_lstmp
+-------------
-merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+dynamic_gru
+-----------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+gru_unit
+--------
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+linear_chain_crf
 ----------------
-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+crf_decoding
+------------
+..  autofunction:: paddle.v2.fluid.layers.crf_decoding
    :noindex:
 cos_sim
--------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
    :noindex:
 cross_entropy
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
    :noindex:
 square_error_cost
 -----------------
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
    :noindex:
 accuracy
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.accuracy
    :noindex:
+chunk_eval
+----------
+..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+    :noindex:
 sequence_conv
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
    :noindex:
 conv2d
 ------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
    :noindex:
 sequence_pool
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
    :noindex:
+pool2d
+------
-sequence_first_step
+..  autofunction:: paddle.v2.fluid.layers.pool2d
-------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
    :noindex:
+batch_norm
+----------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
-sequence_last_step
+beam_search_decode
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
    :noindex:
+conv2d_transpose
+----------------
-pool2d
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
------
-..  autofunction:: paddle.v2.fluid.layers.pool2d
    :noindex:
+sequence_expand
+---------------
-batch_norm
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+reduce_sum
 ----------
-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+reduce_mean
+-----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
    :noindex:
+reduce_max
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
-beam_search_decode
+reduce_min
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+sequence_first_step
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+sequence_last_step
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+dropout
+-------
+..  autofunction:: paddle.v2.fluid.layers.dropout
    :noindex:
+split
+-----
-lod_rank_table
+..  autofunction:: paddle.v2.fluid.layers.split
--------------
-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:
+ctc_greedy_decoder
+------------------
-max_sequence_len
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
----------------
-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:
+edit_distance
+-------------
-topk
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
-----
-..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:
+l2_normalize
+------------
-lod_tensor_to_array
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
-------------------
-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:
+matmul
+------
+..  autofunction:: paddle.v2.fluid.layers.matmul
-array_to_lod_tensor
-------------------
-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:
+warpctc
+-------
+..  autofunction:: paddle.v2.fluid.layers.warpctc
+    :noindex:
+sequence_reshape
+----------------
-fill_constant
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
-------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
-fill_constant_batch_size_like
+im2sequence
-----------------------------
+-----------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
    :noindex:
+nce
+---
-ones
+..  autofunction:: paddle.v2.fluid.layers.nce
----
-..  autofunction:: paddle.v2.fluid.layers.ones
    :noindex:
+beam_search
+-----------
-zeros
+..  autofunction:: paddle.v2.fluid.layers.beam_search
-----
-..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:
+row_conv
+--------
-increment
+..  autofunction:: paddle.v2.fluid.layers.row_conv
---------
-..  autofunction:: paddle.v2.fluid.layers.increment
    :noindex:
+multiplex
+---------
-array_write
+..  autofunction:: paddle.v2.fluid.layers.multiplex
-----------
-..  autofunction:: paddle.v2.fluid.layers.array_write
    :noindex:
+ops
+===
+mean
+----
-create_array
+..  autofunction:: paddle.v2.fluid.layers.mean
------------
-..  autofunction:: paddle.v2.fluid.layers.create_array
    :noindex:
+mul
+---
-less_than
+..  autofunction:: paddle.v2.fluid.layers.mul
---------
-..  autofunction:: paddle.v2.fluid.layers.less_than
    :noindex:
+reshape
+-------
-array_read
+..  autofunction:: paddle.v2.fluid.layers.reshape
----------
-..  autofunction:: paddle.v2.fluid.layers.array_read
    :noindex:
+scale
+-----
-shrink_memory
+..  autofunction:: paddle.v2.fluid.layers.scale
--------------
-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
    :noindex:
+sigmoid_cross_entropy_with_logits
+---------------------------------
-array_length
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
-------------
-..  autofunction:: paddle.v2.fluid.layers.array_length
    :noindex:
+elementwise_add
+---------------
-conv2d_transpose
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
----------------
-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
    :noindex:
+elementwise_div
-sequence_expand
 ---------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
    :noindex:
+elementwise_sub
+---------------
-gru_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
--------
-..  autofunction:: paddle.v2.fluid.layers.gru_unit
    :noindex:
+elementwise_mul
+---------------
-lstm_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
---------
-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
    :noindex:
+elementwise_max
+---------------
-sequence_softmax
+..  autofunction:: paddle.v2.fluid.layers.elementwise_max
----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
    :noindex:
+elementwise_min
+---------------
-reduce_sum
+..  autofunction:: paddle.v2.fluid.layers.elementwise_min
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
    :noindex:
+elementwise_pow
+---------------
-reduce_mean
+..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
    :noindex:
+clip
+----
-reduce_max
+..  autofunction:: paddle.v2.fluid.layers.clip
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_max
    :noindex:
+clip_by_norm
+------------
-reduce_min
+..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_min
    :noindex:
+sequence_softmax
+----------------
-split
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
-----
-..  autofunction:: paddle.v2.fluid.layers.split
    :noindex:
+sigmoid
+-------
-matmul
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
------
-..  autofunction:: paddle.v2.fluid.layers.matmul
    :noindex:
 logsigmoid
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
    :noindex:
 exp
 ---
 ..  autofunction:: paddle.v2.fluid.layers.exp
    :noindex:
 relu
 ----
 ..  autofunction:: paddle.v2.fluid.layers.relu
    :noindex:
 tanh
 ----
 ..  autofunction:: paddle.v2.fluid.layers.tanh
    :noindex:
 tanh_shrink
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
    :noindex:
 softshrink
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.softshrink
    :noindex:
 sqrt
 ----
 ..  autofunction:: paddle.v2.fluid.layers.sqrt
    :noindex:
 abs
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.abs
    :noindex:
 ceil
 ----
 ..  autofunction:: paddle.v2.fluid.layers.ceil
    :noindex:
 floor
 -----
 ..  autofunction:: paddle.v2.fluid.layers.floor
    :noindex:
 round
 -----
 ..  autofunction:: paddle.v2.fluid.layers.round
    :noindex:
 reciprocal
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.reciprocal
    :noindex:
 log
 ---
 ..  autofunction:: paddle.v2.fluid.layers.log
    :noindex:
 square
 ------
 ..  autofunction:: paddle.v2.fluid.layers.square
    :noindex:
 softplus
 --------
 ..  autofunction:: paddle.v2.fluid.layers.softplus
    :noindex:
 softsign
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.softsign
    :noindex:
 brelu
 -----
 ..  autofunction:: paddle.v2.fluid.layers.brelu
    :noindex:
 leaky_relu
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.leaky_relu
    :noindex:
 soft_relu
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.soft_relu
    :noindex:
 elu
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.elu
    :noindex:
 relu6
 -----
 ..  autofunction:: paddle.v2.fluid.layers.relu6
    :noindex:
 pow
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.pow
    :noindex:
+stanh
+-----
+..  autofunction:: paddle.v2.fluid.layers.stanh
+    :noindex:
 hard_shrink
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.hard_shrink
    :noindex:
 thresholded_relu
 ----------------
 ..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
    :noindex:
 hard_sigmoid
-------------
+------------
 ..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
    :noindex:
 swish
------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.swish
    :noindex:
-im2sequence
+tensor
+======
+create_tensor
+-------------
+..  autofunction:: paddle.v2.fluid.layers.create_tensor
+    :noindex:
+create_parameter
+----------------
+..  autofunction:: paddle.v2.fluid.layers.create_parameter
+    :noindex:
+create_global_var
+-----------------
+..  autofunction:: paddle.v2.fluid.layers.create_global_var
+    :noindex:
+cast
+----
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+concat
 ------
-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:
-edit_distance
+sums
---------------
+----
-..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
+..  autofunction:: paddle.v2.fluid.layers.sums
    :noindex:
-ctc_greedy_decoder
+assign
---------------
+------
-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+..  autofunction:: paddle.v2.fluid.layers.assign
    :noindex:
-l2_normalize
+fill_constant_batch_size_like
------------
+-----------------------------
-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
    :noindex:
-sequence_reshape
+fill_constant
----------------
+-------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:
-row_conv
+ones
--------
+----
-..  autofunction:: paddle.v2.fluid.layers.row_conv
+..  autofunction:: paddle.v2.fluid.layers.ones
    :noindex:
-multiplex
+zeros
---------
+-----
-..  autofunction:: paddle.v2.fluid.layers.multiplex
+..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Nets
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+====
+nets
+====
 simple_img_conv_pool
 --------------------
-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-    :noindex:
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-img_conv_group
---------------
-..  autofunction:: paddle.v2.fluid.nets.img_conv_group
    :noindex:
 sequence_conv_pool
 ------------------
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:
 glu
 ---
 ..  autofunction:: paddle.v2.fluid.nets.glu
    :noindex:
 scaled_dot_product_attention
 ----------------------------
 ..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
    :noindex:
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Optimizer
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
-Optimizer
-----------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: Optimizer
-    :noindex:
+=========
+optimizer
+=========
-SGDOptimizer
+SGD
-----------
+---
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: SGDOptimizer
-    :noindex:
+..  autoclass:: paddle.v2.fluid.optimizer.SGD
+    :members:
+    :noindex:
+Momentum
+--------
-MomentumOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Momentum
-----------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: MomentumOptimizer
    :noindex:
+Adagrad
+-------
+..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
-AdagradOptimizer
+    :members:
----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdagradOptimizer
    :noindex:
+Adam
+----
-AdamOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adam
-------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamOptimizer
    :noindex:
+Adamax
+------
-AdamaxOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adamax
-----------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamaxOptimizer
    :noindex:
+DecayedAdagrad
+--------------
-DecayedAdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
-----------------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: DecayedAdagradOptimizer
    :noindex:
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+==========
+param_attr
+==========
 ParamAttr
-===========
+---------
+..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
+WeightNormParamAttr
+-------------------
-ParamAttr
+..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
-----------
+    :members:
-..  automodule:: paddle.v2.fluid.param_attr
-    :members: ParamAttr
    :noindex:
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Profiler
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+========
+profiler
+========
+cuda_profiler
+-------------
-Profiler
-----------
 ..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
    :noindex:
+reset_profiler
+--------------
+..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+    :noindex:
+profiler
+--------
+..  autofunction:: paddle.v2.fluid.profiler.profiler
+    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-Regularizer
+regularizer
 ===========
-WeightDecayRegularizer
+append_regularization_ops
----------------------
+-------------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: WeightDecayRegularizer
-    :noindex:
-L2DecayRegularizer
+..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L2DecayRegularizer
    :noindex:
+L1Decay
+-------
+..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
-L1DecayRegularizer
+L2Decay
-------------------
+-------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L1DecayRegularizer
+..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/README.MD
@@ -140,7 +140,19 @@ TODO by Assignees
 ### Beam Search with CTC and LM
-TODO by Assignees
+<div align="center">
+<img src="image/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
+   - 2) the if condition ```if l^+ not in A_prev then``` after the computation of probabilities is omitted, because it is hard to understand and seems unnecessary.
+- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding: whenever a white space is appended in English decoding, and whenever any character is appended in Mandarin decoding.
+- Such an external scorer consists of a language model, a word count, or any other custom scorers.
+- The **language model** is built in Task 5; its parameters should be carefully tuned to achieve minimum WER/CER (cf. Task 7).
+- This decoder needs to run with **high efficiency**, both for convenient parameter tuning and for real-world speech recognition.
 ## Future Work
@@ -153,3 +165,4 @@ TODO by Assignees
 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
--- a/doc/design/speech/image/beam_search.png
+++ b/doc/design/speech/image/beam_search.png
--- a/doc/design/switch.md
+++ b/doc/design/switch.md
+### Design Doc: Switch
+### Background
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+The following example shows the usage of `fluid.switch`.
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+switch = fluid.switch()
+with switch.block():
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+### The Semantics
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar; this differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or, if no case matches, the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+The above program should print and print only "Case 1".
+The implementation of the backward pass of the `switch` control-flow is easier than the backward pass of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种B
    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
    "WITH_DOC", "是否编译中英文文档", "OFF"
    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_AVX", "Build with AVX support", "ON"
    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
    "WITH_STYLE_CHECK", "Check code style when building", "ON"
-    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
     docker run -p 8888:8888 paddlepaddle/book
+国内用户可以使用下面的镜像源来加速访问：
+  .. code-block:: bash
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 然后在浏览器中输入以下网址：
  .. code-block:: text

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
     docker run -p 8888:8888 paddlepaddle/book
+For users in China, we provide a faster mirror:
+  .. code-block:: bash
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 Then, you would back and paste the address into the local browser:
  .. code-block:: text

--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -92,11 +92,11 @@ paddle.init(
 参数说明
 - use_gpu： **可选，默认False**，是否启用GPU训练
- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- trainer_count：**必选，默认1**，当前trainer的线程数目
 - port：**必选，默认7164**，连接到pserver的端口
 - ports_num：**必选，默认1**，连接到pserver的端口个数
 - ports_num_for_sparse：**必选，默认0**，和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- num_gradient_servers：**必选，默认1**，当前训练任务trainer总数
 - trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开

--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -95,11 +95,11 @@ paddle.init(
 Parameter Description
 - use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
 - port: **required, default 7164**, port to connect to parameter server.
 - ports_num: **required, default 1**, number of ports for communication.
 - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,4 +8,3 @@ PaddlePaddle 文档
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
-  mobile/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
-  mobile/index_en.rst
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
-MOBILE
-======
-..  toctree::
-  :maxdepth: 1
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
-MOBILE
-======
-..  toctree::
-  :maxdepth: 1
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 cc_test(variable_test SRCS variable_test.cc)

--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -23,12 +23,10 @@ namespace framework {
 template <typename T>
 class Channel {
 public:
-  virtual void Send(T*) = 0;
+  virtual bool Send(T*) = 0;
-  virtual void Receive(T*) = 0;
+  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
+  virtual void Close() = 0;
-  // Don't delete channels; instead, call Channel::Close.
- protected:
  virtual ~Channel() {}
 };
@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
 template <typename T>
 void CloseChannel(Channel<T>* ch) {
-  if (ch->Cap() > 0) {
+  ch->Close();
-    delete dynamic_cast<details::Buffered<T>*>(ch);
-  } else {
-    delete dynamic_cast<details::UnBuffered<T>*>(ch);
-  }
 }
 }  // namespace framework

--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -14,13 +14,329 @@ limitations under the License. */
 #include "paddle/framework/channel.h"
+#include <chrono>
+#include <thread>
 #include "gtest/gtest.h"
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
 TEST(Channel, MakeAndClose) {
-  using paddle::framework::Channel;
+  using paddle::framework::details::Buffered;
-  using paddle::framework::MakeChannel;
+  using paddle::framework::details::UnBuffered;
-  using paddle::framework::CloseChannel;
+  {
+    // MakeChannel should return a buffered channel if buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel if buffer_size == 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to write more than buffer size.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
+      else
+        EXPECT_EQ(ch->Send(&i), false);
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum, 45U);
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+TEST(Channel, SimpleUnbufferedChannelTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(ch->Send(&i), true);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    EXPECT_EQ(ch->Receive(&recv), true);
+    EXPECT_EQ(recv, i);
+  }
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  // Verify that at least 4 threads are blocked
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (thread_ended[i] == false) ct++;
+  }
+  // At least 4 threads must be blocked
+  EXPECT_GE(ct, 4);
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  // Verify that only 1 send was successful
+  ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+  // Only 1 send must be successful
+  EXPECT_EQ(ct, 1);
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing an unbuffered channel also
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing an unbuffered channel also
+//  unblocks any senders waiting for receivers
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data = 10;
+          EXPECT_EQ(ch->Send(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Send should block after three iterations
+  // since we only have three receivers.
+  std::thread t([&]() {
+    // Try to send more number of times
+    // than receivers
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum_send, 3U);
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since there are only 5 senders.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
-  Channel<int>* ch = MakeChannel<int>(10);
  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
 }
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <mutex>
 #include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
 namespace paddle {
 namespace framework {
@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 public:
-  virtual void Send(T*);
+  virtual bool Send(T*);
-  virtual void Receive(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return cap_; }
+  virtual void Close();
+  virtual ~Buffered();
 private:
  size_t cap_;
@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> {
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
  std::deque<T> channel_;
+  bool closed_;
-  Buffered(size_t cap) : cap_(cap) {}
+  Buffered(size_t cap) : cap_(cap), closed_(false) {
-  virtual ~Buffered();
+    PADDLE_ENFORCE_GT(cap, 0);
+  }
-  void NotifyAllSenders(std::unique_lock<std::mutex>*);
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
+  std::unique_lock<std::mutex> lock(mu_);
+  full_cond_var_.wait(lock,
+                      [this]() { return channel_.size() < cap_ || closed_; });
+  bool ret = false;
+  if (!closed_) {
+    channel_.push_back(std::move(*item));
+    lock.unlock();
+    empty_cond_var_.notify_one();
+    ret = true;
+  }
+  return ret;
+}
+template <typename T>
+bool Buffered<T>::Receive(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
-  full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
+  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
-  channel_.push_back(std::move(*item));
+  bool ret = false;
-  lock.unlock();
+  if (!closed_) {
-  empty_cond_var_.notify_one();
+    *item = std::move(channel_.front());
+    channel_.pop_front();
+    full_cond_var_.notify_one();
+    ret = true;
+  }
+  return ret;
 }
 template <typename T>
-void Buffered<T>::Receive(T* item) {
+void Buffered<T>::Close() {
  std::unique_lock<std::mutex> lock(mu_);
-  empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
+  closed_ = true;
-  *item = std::move(channel_.front());
+  NotifyAllParticipants(&lock);
-  channel_.pop_front();
-  NotifyAllSenders(&lock);
 }
 template <typename T>
 Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
  channel_.clear();
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }
 template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
-  full_cond_var_.notify_one();
+  full_cond_var_.notify_all();
+  empty_cond_var_.notify_all();
 }
 }  // namespace details

--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <atomic>
 #include <condition_variable>
-#include <deque>
 #include <mutex>
 #include "paddle/framework/channel.h"
@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 public:
-  virtual void Send(T*);
+  virtual bool Send(T*);
-  virtual void Receive(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return 0; }
+  virtual void Close();
+  virtual ~UnBuffered();
 private:
-  UnBuffered() {}
+  std::mutex mu_ch_;
-  virtual ~UnBuffered();
+  // Mutex for readers and writers who are waiting for other reader
+  // and writer to complete execution
+  std::recursive_mutex mu_read_, mu_write_;
+  // reader_found_ is set true when a reader is ready to accept data
+  // writer_found_ is set true when a writer is ready to send data
+  // A transaction occurs only when both are true
+  std::atomic<bool> reader_found_{false}, writer_found_{false};
+  std::condition_variable cv_channel_;
+  std::condition_variable_any cv_reader_, cv_writer_;
+  T* item{nullptr};
+  std::atomic<bool> closed_{false};
+  UnBuffered() : closed_(false) {}
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template <typename T>
+bool UnBuffered<T>::Send(T* data) {
+  // Prevent other writers from entering
+  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+  writer_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+  // If writer comes first, it should wait till a reader arrives
+  cv_writer_.wait(cv_lock,
+                  [this]() { return reader_found_ == true || closed_; });
+  cv_reader_.notify_one();
+  bool ret = false;
+  if (!closed_) {
+    std::unique_lock<std::mutex> channel_lock(mu_ch_);
+    item = data;
+    channel_lock.unlock();
+    cv_channel_.notify_one();
+    channel_lock.lock();
+    cv_channel_.wait(channel_lock,
+                     [this]() { return item == nullptr || closed_; });
+    ret = true;
+  }
+  writer_found_ = false;
+  return ret;
+}
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
+template <typename T>
+bool UnBuffered<T>::Receive(T* data) {
+  // Prevent other readers from entering
+  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+  reader_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+  // If reader comes first, it should wait till a writer arrives
+  cv_reader_.wait(cv_lock,
+                  [this]() { return writer_found_ == true || closed_; });
+  cv_writer_.notify_one();
+  bool ret = false;
+  if (!closed_) {
+    std::unique_lock<std::mutex> lock_ch{mu_ch_};
+    // Reader should wait for the writer to first write its data
+    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+    if (!closed_) {
+      *data = std::move(*item);
+      item = nullptr;
+      lock_ch.unlock();
+      ret = true;
+    }
+    cv_channel_.notify_one();
+  }
+  reader_found_ = false;
+  return ret;
+}
+// This function implements the sequence of events
+// that take place once the channel is closed.
 template <typename T>
-void UnBuffered<T>::Send(T* channel_element) {}
+void UnBuffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
 template <typename T>
-void UnBuffered<T>::Receive(T*) {}
+UnBuffered<T>::~UnBuffered() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+// This function notifies all the readers, writers and
+// the channel condition variables.
 template <typename T>
-UnBuffered<T>::~UnBuffered() {}
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  cv_writer_.notify_all();
+  cv_channel_.notify_all();
+  cv_reader_.notify_all();
+}
 }  // namespace details
 }  // namespace framework

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
            "Checking whether operator produce NAN/INF or not. It will be "
            "extremely slow so please use this flag wisely.");
@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
 Executor::Executor(const platform::Place& place) : place_(place) {}
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    op->Run(*local_scope, place_);
    VLOG(3) << op->DebugStringEx(local_scope);
-    if (FLAGS_do_memory_benchmark) {
+    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);
    }
@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  if (create_vars && create_local_scope) {
    scope->DeleteScope(local_scope);
  }
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
    VLOG(2) << "-------------------------------------------------------";
    VLOG(2) << "Memory used after deleting local scope: "
            << memory::memory_usage(place_);

--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
@@ -20,5 +21,8 @@ namespace paddle {
 namespace framework {
 using FeedFetchType = LoDTensor;
 using FeedFetchList = std::vector<FeedFetchType>;
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string.h>  // for strdup
 #include <algorithm>
+#include <stdexcept>
 #include <string>
 #include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
  std::vector<platform::Place> places;
  places.emplace_back(platform::CPUPlace());
+  int count = 0;
 #ifdef PADDLE_WITH_CUDA
-  int count = platform::GetCUDADeviceCount();
+  try {
-  for (int i = 0; i < count; ++i) {
+    count = platform::GetCUDADeviceCount();
-    places.emplace_back(platform::CUDAPlace(i));
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
  }
 #else
  LOG(WARNING)
-      << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+      << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
 #endif
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(platform::CUDAPlace(i));
+  }
  platform::DeviceContextPool::Init(places);
 }

--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
  using paddle::framework::InitDevices;
  using paddle::platform::DeviceContextPool;
+#ifndef PADDLE_WITH_CUDA
  InitDevices();
  DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_GE(pool.size(), 1U);
+  ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+TEST(InitDevices, CUDA) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+#ifdef PADDLE_WITH_CUDA
+  int count = paddle::platform::GetCUDADeviceCount();
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
 }
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
-#include <glog/logging.h>
 namespace paddle {
 namespace framework {

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 #endif
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-#ifndef PADDLE_WITH_CUDA
-template <typename T>
-using Vector = std::vector<T>;
-#else
-template <typename T>
-using Vector = thrust::host_vector<
-    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
-#endif
 /*
 * LoD is short for Level of Details.
 *
@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
 *    0 2 4 7
 *    0 2 5 7 10 12 15 20
 */
-using LoD = std::vector<Vector<size_t>>;
+struct LoD : public std::vector<Vector<size_t>> {
+  using std::vector<Vector<size_t>>::vector;
+  void CopyFromCUDA() {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyFromCUDA();
+    }
+  }
+};
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
 */
 class LoDTensor : public Tensor {
 public:
-  LoDTensor() {}
+  LoDTensor() : Tensor() {}
+  /* Constructor with place should only be used in pybind */
+  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
  explicit LoDTensor(const LoD& lod) : lod_(lod) {}

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
 namespace paddle {
 namespace framework {
+TEST(LoD, data) {
+  // Build a three-level LoD using the different ways a level can be supplied:
+  // nested initializer list, push_back of a braced list, push_back of a
+  // std::vector.
+  LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  // Level 0 was initialised with 0, 1, 2, so every element equals its index.
+  size_t expected = 0;
+  for (auto value : lod[0]) {
+    EXPECT_EQ(value, expected);
+    ++expected;
+  }
+}
 TEST(LodExpand, test) {
  LoD lod{{0, 2}};
  LoDTensor tensor;

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/platform/assert.h"
@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
  }
 }
+TEST(Vector, Normal) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  paddle::framework::InitDevices();
+  paddle::framework::Vector<size_t> vec({1, 2, 3});
+  // Host view: data() must expose the same elements as operator[].
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+  // Round trip through the device. The contents have to be pushed to the
+  // device first: CopyFromCUDA() returns early when no device buffer exists,
+  // which would leave the cleared vector empty and make the reads below go
+  // out of bounds.
+  vec.CopyToCUDA();
+  vec.clear();
+  vec.CopyFromCUDA();
+  std::vector<size_t> v = {1, 2, 3};
+  // Stop here if the size was not restored, instead of indexing past the end.
+  ASSERT_EQ(v.size(), vec.size());
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+TEST(LoD, data) {
+  paddle::framework::InitDevices();
+  // Three levels, each supplied through a different construction path.
+  paddle::framework::LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  auto& level0 = lod[0];
+  // Run the `test` kernel on the device copy of level 0, wait for it, then
+  // pull the device contents back into the host vector.
+  test<<<1, 1>>>(level0.cuda_data(), level0.size());
+  cudaDeviceSynchronize();
+  level0.CopyFromCUDA();
+  // Level 0 started as 0, 1, 2; the expectation is that each element has
+  // been doubled on the device.
+  for (size_t idx = 0; idx < level0.size(); ++idx) {
+    EXPECT_EQ(level0[idx], idx * 2);
+  }
+}
 TEST(LoDTensor, LoDInGPU) {
+  paddle::framework::InitDevices();
  paddle::framework::LoDTensor lod_tensor;
  paddle::platform::CUDAPlace place(0);
@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
  auto lod = lod_tensor.lod();
-  test<<<1, 8>>>(lod[0].data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
  cudaDeviceSynchronize();
+  lod.CopyFromCUDA();
  for (size_t i = 0; i < src_lod[0].size(); ++i) {
    EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);

--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <initializer_list>
+#include <vector>
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+namespace paddle {
+namespace framework {
+/**
+ * @brief A std::vector that can also keep a copy of its data on a CUDA device.
+ *
+ * The host storage is the std::vector base class and lives as long as the
+ * Vector. The device buffer is allocated lazily (see CopyToCUDA()) and is
+ * released in the destructor. Host and device copies are synchronized only
+ * by explicit calls to CopyToCUDA(), CopyFromCUDA() or cuda_data().
+ *
+ * Each Vector owns its device buffer exclusively: copying a Vector copies
+ * the host elements only, never the raw device pointer.
+ */
+template <typename T>
+class Vector : public std::vector<T> {
+ public:
+  using std::vector<T>::vector;
+  Vector() {}
+  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
+  /* Copy host elements only. The new object starts without a device buffer
+   * (allocated on demand on the same place), so two Vectors never free the
+   * same device pointer. */
+  Vector(const Vector &other) : std::vector<T>(other), place_(other.place_) {}
+  /* Assign host elements only. This object's own device buffer, if any, is
+   * kept for reuse; it is refreshed by the next CopyToCUDA()/cuda_data(). */
+  Vector &operator=(const Vector &other) {
+    std::vector<T>::operator=(other);
+    return *this;
+  }
+  /* Release the device buffer, if one was ever allocated. */
+  virtual ~Vector() {
+#ifdef PADDLE_WITH_CUDA
+    if (cuda_ptr_ != nullptr) {
+      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    }
+#endif
+  }
+  /* Get device vector: copies the host data to the device first, then
+   * returns the device pointer. Fails the enforce check when no device
+   * buffer exists after the copy. */
+  T *cuda_data() {
+    CopyToCUDA();
+    PADDLE_ENFORCE_NOT_NULL(
+        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+    return static_cast<T *>(cuda_ptr_);
+  }
+  /* Get host vector */
+  T *data() { return std::vector<T>::data(); }
+  const T *data() const { return std::vector<T>::data(); }
+  /* Synchronize host vector to device vector */
+  void CopyToCUDA();
+  /* Synchronize device vector to host vector */
+  void CopyFromCUDA();
+  /* Switch device vector location */
+  void CopyToPeer(platform::Place);
+ private:
+  void *cuda_ptr_ = nullptr;  // device buffer; nullptr until first allocated
+  size_t cuda_size_ = 0;  // device vector numel
+  platform::CUDAPlace place_;  // device on which cuda_ptr_ is allocated
+};
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  const size_t host_numel = this->size();
+  const size_t num_bytes = host_numel * sizeof(T);
+  // Replace the device buffer only when the host vector now holds more
+  // elements than the device side was last recorded to hold.
+  if (host_numel > cuda_size_) {
+    if (cuda_ptr_ != nullptr) {
+      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    }
+    cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(place_, num_bytes);
+  }
+  cuda_size_ = host_numel;
+  // Copy host -> device on the stream of the context bound to place_, then
+  // wait on that context before returning.
+  auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+               static_cast<const void *>(this->data()), num_bytes,
+               dev_ctx->stream());
+  dev_ctx->Wait();
+#endif
+}
+template <typename T>
+void Vector<T>::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  // Without a device buffer there is nothing to copy back.
+  if (!cuda_ptr_) {
+    LOG(WARNING) << "No uncommitted cuda data.";
+    return;
+  }
+  // Give the host vector the same element count as the device side.
+  this->resize(cuda_size_);
+  auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  void *host_dst = static_cast<void *>(this->data());
+  const void *device_src = static_cast<const void *>(cuda_ptr_);
+  // Copy device -> host on the context's stream, then wait on the context.
+  memory::Copy(platform::CPUPlace(), host_dst, place_, device_src,
+               this->size() * sizeof(T), dev_ctx->stream());
+  dev_ctx->Wait();
+#endif
+}
+/* Move the device copy of this vector to another CUDA place.
+ * peer_place must hold a platform::CUDAPlace (it is unwrapped with
+ * boost::get). After the call the device buffer lives on the peer place and
+ * the buffer on the previous place has been freed. */
+template <typename T>
+void Vector<T>::CopyToPeer(platform::Place peer_place) {
+#ifdef PADDLE_WITH_CUDA
+  auto peer = boost::get<platform::CUDAPlace>(peer_place);
+  if (cuda_ptr_ == nullptr) {
+    // No device buffer exists yet, so there is nothing to copy or free;
+    // just remember the new place for the later lazy allocation.
+    place_ = peer;
+    return;
+  }
+  // Size the transfer by the number of elements recorded for the device
+  // buffer, not by the current host size: the host vector may have been
+  // resized since the last synchronization.
+  const size_t num_bytes = cuda_size_ * sizeof(T);
+  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(peer, num_bytes);
+  memory::Copy(peer, peer_cuda_ptr, place_, cuda_ptr_, num_bytes,
+               ctx->stream());
+  // Wait on the source context before releasing the source buffer.
+  ctx->Wait();
+  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+  place_ = peer;
+  cuda_ptr_ = peer_cuda_ptr;
+#endif
+}
+template class Vector<int>;
+template class Vector<unsigned>;
+template class Vector<size_t>;
+template class Vector<int64_t>;
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
  bool HasOutputs(const std::string &name) const override;
-  DDim GetInputDim(const std::string &name) const override;
-  void SetOutputDim(const std::string &name, const DDim &dim) override;
  AttrReader Attrs() const override;
  const std::vector<std::string> &Inputs(
@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
  return true;
 }
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
-  std::vector<DDim> ddims = GetInputsDim(name);
-  auto length = ddims.size();
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have 1 value, "
-                    "but it has %d now",
-                    name, length);
-  return ddims[0];
-}
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
-                                                const DDim &dim) {
-  SetOutputsDim(name, {dim});
-}
 AttrReader CompileTimeInferShapeContext::Attrs() const {
  return AttrReader(op_.GetAttrMap());
 }

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
-DEFINE_bool(op_sync, false,
+DECLARE_bool(benchmark);
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
 namespace paddle {
 namespace framework {
@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return true;
  }
-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
  const std::vector<std::string>& Inputs(
@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope,
      ExecutionContext(*this, new_scope, *new_dev_ctx));
  /*For profiling/benchmark only*/
-  if (FLAGS_op_sync) {
+  if (FLAGS_benchmark) {
    new_dev_ctx->Wait();
  }
 }

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,13 +14,11 @@ limitations under the License. */
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 namespace paddle {
 namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
 BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
  auto *b = desc_.add_blocks();
  b->set_parent_idx(parent.ID());

--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <vector>
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/proto_desc.h"
 #include "paddle/platform/macros.h"

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs");
+            "and add some memory usage logs. "
+            "Default cuda is asynchronous device, set to True will "
+            "force op run in synchronous mode.");
 namespace paddle {
 namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
  // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
    delete scope;
  } else {
    Async([scope] { delete scope; });

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+// Return the dimension of the single variable bound to input slot `name`.
+// The enforce check fails unless the slot holds exactly one variable.
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+  const auto &bound_vars = Inputs(name);
+  const auto num_vars = bound_vars.size();
+  PADDLE_ENFORCE_EQ(num_vars, 1UL,
+                    "Input(%s) should hold one element, but now it holds %d",
+                    name, num_vars);
+  return GetDim(bound_vars.front());
+}
+std::vector<DDim> InferShapeContext::GetInputsDim(
    const std::string &name) const {
-  const std::vector<std::string> &names = Inputs(name);
+  const std::vector<std::string> &arg_names = Inputs(name);
-  return GetDims(names);
+  return GetDims(arg_names);
 }
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
  return this->GetDim(names[idx]);
 }
-void InferShapeContext::SetOutputsDim(
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
-    const std::string &name, const std::vector<framework::DDim> &dims) {
+  auto &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Output(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  SetDim(arg_names[0], dim);
+}
+void InferShapeContext::SetOutputsDim(const std::string &name,
+                                      const std::vector<DDim> &dims) {
  auto &names = Outputs(name);
  SetDims(names, dims);
 }
-std::vector<framework::DDim> InferShapeContext::GetDims(
+std::vector<DDim> InferShapeContext::GetDims(
    const std::vector<std::string> &names) const {
-  std::vector<framework::DDim> ret;
+  std::vector<DDim> ret;
  ret.reserve(names.size());
  std::transform(
      names.begin(), names.end(), std::back_inserter(ret),
      [this](const std::string &name) { return this->GetDim(name); });
  return ret;
 }
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<framework::DDim> &dims) {
+                                const std::vector<DDim> &dims) {
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t i = 0; i < length; ++i) {

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,13 @@ class InferShapeContext {
  virtual bool HasInputs(const std::string &name) const = 0;
  virtual bool HasOutputs(const std::string &name) const = 0;
-  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+  DDim GetInputDim(const std::string &name) const;
-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetInputsDim(const std::string &name) const;
  DDim GetInputsElementDim(const std::string &name, int idx) const;
-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  void SetOutputDim(const std::string &name, const DDim &dim);
-  void SetOutputsDim(const std::string &name,
+  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
-                     const std::vector<framework::DDim> &dims);
  virtual AttrReader Attrs() const = 0;
  virtual const std::vector<std::string> &Inputs(
@@ -57,15 +56,13 @@ class InferShapeContext {
  // Note: In while op, we need this to be public
  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+               const std::vector<DDim> &dims);
 protected:
-  virtual framework::DDim GetDim(const std::string &name) const = 0;
+  virtual DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+  virtual void SetDim(const std::string &name, const DDim &dim) = 0;
-  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const;
+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
  std::vector<proto::VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
 public:
  Tensor() : offset_(0) {}
+  /*! Constructor with place should only be used in pybind.
+   *  The place is forwarded to the memory holder only when one exists.
+   *  NOTE(review): no constructor visible here creates holder_, so it is
+   *  presumably still null at this point and an unguarded dereference would
+   *  crash -- confirm against the declaration of holder_. */
+  explicit Tensor(const platform::Place& place) : offset_(0) {
+    if (holder_ != nullptr) {
+      holder_->set_place(place);
+    }
+  }
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
    virtual std::type_index type() const = 0;
    virtual platform::Place place() const = 0;
    virtual void set_type(std::type_index type) = 0;
+    virtual void set_place(platform::Place place) = 0;
  };
  template <typename Place>
@@ -156,6 +162,7 @@ class Tensor {
    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual std::type_index type() const { return type_; }
    virtual void set_type(std::type_index type) { type_ = type; }
+    virtual void set_place(platform::Place place) { place_ = place; }
    /*! the pointer of memory block. */
    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
    TensorShape colShape;
-    real* colData = NULL;
-    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    // Max col matrix width 4096, Max col matrix size 4M.
-    size_t colWidth = outputHeight * outputWidth;
+    size_t outputHeightSteps =
-    // Max col matrix height 256, Max col matrix width 1024
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
-    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t maxColWidth = outputHeightSteps * outputWidth;
-    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+    size_t channelSteps = std::min(
+        std::max((1048576 / maxColWidth) / (filterHeight * filterWidth),
+                 (size_t)1),
+        inputChannels / groups_);
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
                              outputHeight,
                              outputWidth});
-      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
      colData = reinterpret_cast<real*>(memory_->getBuf());
    }
@@ -209,20 +212,24 @@ public:
        (outputChannels / groups_) * outputHeight * outputWidth;
    size_t filterOffset = filter.getElements() / groups_;
-    int nStride = colWidth;
+    int nStride = outputHeight * outputWidth;
-    int kStride = colHeight;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
    for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
      for (size_t g = 0; g < groups_; g++) {
        if (needIm2col) {
          real beta_ = beta;
-          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+          for (size_t ic = 0; ic < inputChannels / groups_;
-               colHeightStart += stepColHeight) {
+               ic += channelSteps) {
-            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
-                 colWidthStart += stepColWidth) {
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
-              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int height = std::min(outputHeight - oh, outputHeightSteps);
-              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
              // im2col
-              im2col(inputData + g * inputOffset,
+              im2col(inputData,
                     imShape,
                     colData,
                     colShape,
@@ -232,13 +239,12 @@ public:
                     paddingW(),
                     dilationH(),
                     dilationW(),
-                     colHeightStart,
+                     channels,
-                     K,
+                     oh,
-                     colWidthStart,
+                     height,
                     N);
              // gemm
-              int M = outputChannels / groups_;
              BlasGemm<Device, real>::compute(
                  false,
                  false,
@@ -246,12 +252,12 @@ public:
                  N,
                  K,
                  1.0f,
-                  filterData + g * filterOffset + colHeightStart,
+                  filterData + ic * filterHeight * filterWidth,
                  kStride,
                  colData,
                  N,
                  beta_,
-                  outputData + g * outputOffset + colWidthStart,
+                  outputData + oh * outputWidth,
                  nStride);
            }
            beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
                                          N,
                                          K,
                                          1.0f,
-                                          filterData + g * filterOffset,
+                                          filterData,
                                          K,
-                                          inputData + g * inputOffset,
+                                          inputData,
                                          N,
                                          beta,
-                                          outputData + g * outputOffset,
+                                          outputData,
                                          N);
        }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
    }
    memory_.reset();

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -111,39 +111,42 @@ public:
                  int paddingWidth,
                  int dilationHeight,
                  int dilationWidth,
-                  int colHeightStart,
+                  int inputChannels,
-                  int colHeightSize,
+                  int colOffset,
-                  int colWidthStart,
+                  int colOutputHeight,
-                  int colWidthSize) {
+                  int colWidth) {
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
    int filterHeight = colShape[1];
    int filterWidth = colShape[2];
    int outputWidth = colShape[4];
-    for (int colh = 0; colh < colHeightSize; colh++) {
+    for (int ic = 0; ic < inputChannels; ic++) {
-      int wOffset = (colHeightStart + colh) % filterWidth;
+      for (int oh = 0; oh < colOutputHeight; oh++) {
-      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
+        T* dstData = colData + oh * outputWidth;
-      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
-      for (int colw = 0; colw < colWidthSize; colw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight +
-        int h = (colWidthStart + colw) / outputWidth;
+                           fh * dilationHeight - paddingHeight;
-        int w = (colWidthStart + colw) % outputWidth;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
-        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+            } else {
-        int imColIdx = w * strideWidth + wOffset * dilationWidth;
+              for (int ow = 0; ow < outputWidth; ow++) {
-        if ((imRowIdx - paddingHeight) < 0 ||
+                int imColIdx =
-            (imRowIdx - paddingHeight) >= inputHeight ||
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
-            (imColIdx - paddingWidth) < 0 ||
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
-            (imColIdx - paddingWidth) >= inputWidth) {
+                  dstData[ow] = T(0);
-          colData[colh * colWidthSize + colw] = static_cast<T>(0);
+                } else {
-        } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
-          imRowIdx += c_im * inputHeight - paddingHeight;
+                }
-          imColIdx -= paddingWidth;
+              }
-          colData[colh * colWidthSize + colw] =
+            }
-              imData[imRowIdx * inputWidth + imColIdx];
+            dstData += colWidth;
+          }
        }
      }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
    }
  }
 };

--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
                          padding,
                          dilation,
                          dilation,
+                          channels,
                          0,
-                          height,
+                          outputHeight,
-                          0,
+                          outputHeight * outputWidth);
-                          width);
                  autotest::TensorCheckEqual(*output1, *output2);
                }

--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
 cc_library(paddle_fluid_api
    SRCS io.cc
@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS
  inference_lib framework_lib memory_lib platform_lib string_lib
  gflags_lib glog_lib protobuf_lib eigen3_lib)
-add_executable(example example.cc)
+if(WITH_TESTING)
-if(APPLE)
+  add_subdirectory(tests/book)
-  set(OPTIONAL_LINK_FLAGS)
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-    set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
-  endif()
-  target_link_libraries(example
-      -Wl,-force_load paddle_fluid
-      ${OPTIONAL_LINK_FLAGS}
-      ${PTOOLS_LIB})
-else()
-  target_link_libraries(example
-      -Wl,--start-group -Wl,--whole-archive paddle_fluid
-      -Wl,--no-whole-archive -Wl,--end-group
-      ${PTOOLS_LIB})
 endif()
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/inference/io.h"
 #include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 namespace paddle {
 namespace inference {
-const std::string kFeedOpType = "feed";
 bool IsParameter(const framework::VarDesc* var,
                 const framework::ProgramDesc& main_program) {
  if (var->Persistable()) {
@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var,
    for (size_t i = 0; i < main_program.Size(); ++i) {
      const framework::BlockDesc& block = main_program.Block(i);
      for (auto* op : block.AllOps()) {
-        if (op->Type() == kFeedOpType) {
+        if (op->Type() == framework::kFeedOpType) {
          continue;
        }
        for (auto input_argument_name : op->InputArgumentNames()) {
@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor,
  framework::BlockDesc* load_block = load_program->MutableBlock(0);
  for (auto* var : global_block.AllVars()) {
    if (IsParameter(var, main_program)) {
-      LOG(INFO) << "parameter's name: " << var->Name();
+      VLOG(3) << "parameter's name: " << var->Name();
      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->Shape());

--- a/paddle/inference/io.h
+++ b/paddle/inference/io.h
@@ -17,18 +17,13 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/framework/block_desc.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
-#include "paddle/framework/var_desc.h"
 namespace paddle {
 namespace inference {
-bool IsParameter(const framework::VarDesc* var,
-                 const framework::ProgramDesc& main_program);
 void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
                      const std::string& dirname,

--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
+# Directory of the Python fluid tests; the model path passed to the C++
+# inference test below is located underneath it.
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+# C++ inference test for the recognize_digits MLP model. paddle_fluid is
+# listed between ARCHIVE_START and ARCHIVE_END, and the model directory is
+# handed to the test binary through the --dirname flag.
+cc_test(test_inference_recognize_digits_mlp
+    SRCS test_inference_recognize_digits.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+# Order this test after test_recognize_digits.
+# NOTE(review): presumably that test writes the inference model directory
+# loaded here -- confirm.
+set_tests_properties(test_inference_recognize_digits_mlp
+    PROPERTIES DEPENDS test_recognize_digits)
--- a/paddle/inference/example.cc
+++ b/paddle/inference/example.cc
@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gtest/gtest.h>
 #include <time.h>
-#include <iostream>
+#include <sstream>
 #include "gflags/gflags.h"
-#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/inference/io.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
-int main(int argc, char** argv) {
+template <typename Place, typename T>
-  google::ParseCommandLineFlags(&argc, &argv, true);
+void TestInference(const std::string& dirname,
-  if (FLAGS_dirname.empty()) {
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-    // Example:
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-    //   ./example --dirname=recognize_digits_mlp.inference.model
+  // 1. Define place, executor and scope
-    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
+  auto place = Place();
-    exit(1);
+  auto executor = paddle::framework::Executor(place);
-  }
-  // 1. Define place, executor, scope
-  auto place = paddle::platform::CPUPlace();
-  paddle::framework::InitDevices();
-  auto* executor = new paddle::framework::Executor(place);
  auto* scope = new paddle::framework::Scope();
-  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  // 2. Initialize the inference_program and load all parameters from file
-  std::string dirname = FLAGS_dirname;
+  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
-  // 2. Initialize the inference program
-  auto inference_program = paddle::inference::Load(*executor, *scope, dirname);
-  // 3. Optional: perform optimization on the inference_program
+  // 3. Get the feed_target_names and fetch_target_names
-  // 4. Get the feed_target_names and fetch_target_names
  const std::vector<std::string>& feed_target_names =
      inference_program->GetFeedTargetNames();
  const std::vector<std::string>& fetch_target_names =
      inference_program->GetFetchTargetNames();
-  // 5. Generate input
+  // 4. Prepare inputs: set up maps for feed targets
-  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
-  std::vector<paddle::framework::LoDTensor> feeds;
-  feeds.push_back(input);
-  std::vector<paddle::framework::LoDTensor> fetchs;
-  // Set up maps for feed and fetch targets
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  // set_feed_variable
  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    feed_targets[feed_target_names[i]] = &feeds[i];
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
  }
-  // get_fetch_variable
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  fetchs.resize(fetch_target_names.size());
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = &fetchs[i];
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
  }
-  // Run the inference program
+  // 6. Run the inference program
-  executor->Run(*inference_program, scope, feed_targets, fetch_targets);
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
-  // Get outputs
+  delete scope;
-  for (size_t i = 0; i < fetchs.size(); ++i) {
+}
-    auto dims_i = fetchs[i].dims();
-    std::cout << "dims_i:";
+TEST(inference, recognize_digits) {
-    for (int j = 0; j < dims_i.size(); ++j) {
+  if (FLAGS_dirname.empty()) {
-      std::cout << " " << dims_i[j];
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-    }
-    std::cout << std::endl;
-    std::cout << "result:";
-    float* output_ptr = fetchs[i].data<float>();
-    for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
-      std::cout << " " << output_ptr[j];
-    }
-    std::cout << std::endl;
  }
-  delete scope;
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  delete executor;
+  std::string dirname = FLAGS_dirname;
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  return 0;
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+  float err = 1E-3;
+  int count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
 }
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    CHECK_EQ(channels * outLength, maskMatP->getWidth());
  }
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[i * outStride + j] = -(real)FLT_MAX;
-    }
-  }
  /* pool max one by one */
  for (size_t n = 0; n < num; ++n) {  // frame by frame
    if (!isContiguous()) {
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    for (size_t c = 0; c < channels; ++c) {  // channel by channel
      for (size_t ph = 0; ph < outputH; ++ph) {
        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
+        int hend = hstart + sizeY;
-        hstart = std::max(hstart, 0);
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
        for (size_t pw = 0; pw < outputW; ++pw) {
          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
+          int wend = wstart + sizeX;
-          wstart = std::max(wstart, 0);
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
          if (maskData == NULL) {
+            real tmp = -(real)FLT_MAX;
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
-                outData[ph * outputW + pw] = std::max(
+                tmp = tmp < inputData[h * imgSizeW + w]
-                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+                          ? inputData[h * imgSizeW + w]
+                          : tmp;
              }
            }
+            outData[ph * outputW + pw] = tmp;
          } else {
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
 endif()
 op_library(cond_op DEPS framework_proto tensor net_op)
@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+    vol2col depthwise_conv)
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
@@ -173,6 +178,8 @@ endif()
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
@@ -192,3 +199,4 @@ if(WITH_GPU)
    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
    auto grad_merge = merge_func(context, grad);
    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    auto& merge_rows = grad_merge.rows();
+    framework::Vector<int64_t> merge_rows(grad_merge.rows());
    // 2. m += g_m * g_m
    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
    auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
    SparseAdagradFunctorKernel<
        T, 256><<<grid2, threads, 0,
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
+                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
-                                   lr, param_data, moment_data, grad_width,
+                                   param_data, moment_data, grad_width,
                                   epsilon);
  }
 };

--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
          merge_func(ctx.template device_context<DeviceContext>(), grad);
      auto& grad_tensor = grad_merge.value();
      const T* grad_data = grad_tensor.template data<T>();
-      auto* rows = grad_merge.rows().data();
+      int64_t* rows = nullptr;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = grad_merge.mutable_rows()->cuda_data();
+      } else {
+        rows = grad_merge.mutable_rows()->data();
+      }
      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
      SparseAdamFunctor<T> functor(

--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
                   "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
    auto dims = ctx->GetInputDim("DistMat");
    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
    ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDis", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
  }
 };
@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* dist_mat = context.Input<LoDTensor>("DistMat");
    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
              "Otherwise, it means B[j] is matched to row "
              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
              "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDis",
+    AddOutput("ColToRowMatchDist",
              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
              "instance are called LoD. Then "
-              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
    AddComment(R"DOC(
 This operator is a greedy bipartite matching algorithm, which is used to
 obtain the matching with the maximum distance based on the input

--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/box_coder_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for box_coder: validates the operator's inputs and
+// output and infers the shape of OutputBox.
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  // Requires PriorBox, PriorBoxVar, TargetBox and OutputBox to be present,
+  // PriorBox and TargetBox to be rank-2 with 4 columns, and PriorBoxVar to
+  // have the same dims as PriorBox. OutputBox is then shaped
+  // [TargetBox rows, PriorBox rows, 4] and shares TargetBox's LoD.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+    // The messages name the tensor actually being checked and use the same
+    // M (prior boxes) / N (target boxes) convention as the operator docs.
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBox must be 2");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [M, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                      "The rank of Input of TargetBox must be 2");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                      "The shape of TargetBox is [N, 4]");
+    // Called only for validation; the returned value is discarded.
+    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+  }
+};
+// Declares the inputs, attribute, output and user-facing documentation of the
+// box_coder operator.
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>) "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+             "of variance.");
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the box if the input "
+        "is image feature map, they are close to the origin of the coordinate "
+        "system. [xmax, ymax] is the right bottom coordinate of the box. "
+        "This tensor can contain LoD information to represent a batch "
+        "of inputs. One instance of this batch can contain different "
+        "numbers of entities.");
+    // Only the two listed code types are accepted; encoding is the default.
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddOutput(
+        "OutputBox",
+        "(LoDTensor or Tensor) "
+        "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
+        "representing the result of N target boxes encoded/decoded with "
+        "M Prior boxes and variances.");
+    // The decoding formulas below mirror the arithmetic of the decode
+    // kernels: the center and size are recovered first, then the box is
+    // emitted in corner form.
+    AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv 
+oh = log(abs(th / ph)) / phv 
+The Decoding schema described below:
+ox = pw * pxv * tx + px
+oy = ph * pyv * ty + py
+ow = exp(pwv * tw) * pw
+oh = exp(phv * th) * ph
+The decoded box is output in corner form as
+[ox - ow / 2, oy - oh / 2, ox + ow / 2, oy + oh / 2].
+Here tx, ty, tw, th denote the target box's center coordinates, width and
+height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
+center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
+of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
+width and height.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+// Encodes (target box, prior box) pairs into center-size offsets.
+// Each thread handles one pair: pair index p maps to target row p / col and
+// prior row p % col, and writes four values starting at output[p * len].
+// Offsets 0..3 of a box row are read as xmin, ymin, xmax, ymax.
+template <typename T>
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int pair = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads beyond the last pair have nothing to compute.
+  if (pair >= row * col) return;
+  const T* target = target_box_data + (pair / col) * len;
+  const T* prior = prior_box_data + (pair % col) * len;
+  const T* variance = prior_box_var_data + (pair % col) * len;
+  T* encoded = output + pair * len;
+  // Prior box size and center.
+  T prior_w = prior[2] - prior[0];
+  T prior_h = prior[3] - prior[1];
+  T prior_cx = (prior[2] + prior[0]) / 2;
+  T prior_cy = (prior[3] + prior[1]) / 2;
+  // Target box center and size.
+  T target_cx = (target[2] + target[0]) / 2;
+  T target_cy = (target[3] + target[1]) / 2;
+  T target_w = target[2] - target[0];
+  T target_h = target[3] - target[1];
+  // Center offsets are scaled by the prior size, sizes are log-ratios; each
+  // component is divided by its variance.
+  encoded[0] = (target_cx - prior_cx) / prior_w / variance[0];
+  encoded[1] = (target_cy - prior_cy) / prior_h / variance[1];
+  encoded[2] = log(fabs(target_w / prior_w)) / variance[2];
+  encoded[3] = log(fabs(target_h / prior_h)) / variance[3];
+}
+// Decodes (target box, prior box) pairs from center-size offsets back into
+// corner-form boxes.
+// Each thread handles one pair: pair index p maps to target row p / col and
+// prior row p % col, and writes four values starting at output[p * len].
+template <typename T>
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int pair = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads beyond the last pair have nothing to compute.
+  if (pair >= row * col) return;
+  const T* target = target_box_data + (pair / col) * len;
+  const T* prior = prior_box_data + (pair % col) * len;
+  const T* variance = prior_box_var_data + (pair % col) * len;
+  T* decoded = output + pair * len;
+  // Prior box size and center.
+  T prior_w = prior[2] - prior[0];
+  T prior_h = prior[3] - prior[1];
+  T prior_cx = (prior[2] + prior[0]) / 2;
+  T prior_cy = (prior[3] + prior[1]) / 2;
+  // Recover the box size, then its center, from the encoded values.
+  T box_w = exp(variance[2] * target[2]) * prior_w;
+  T box_h = exp(variance[3] * target[3]) * prior_h;
+  T box_cx = variance[0] * target[0] * prior_w + prior_cx;
+  T box_cy = variance[1] * target[1] * prior_h + prior_cy;
+  // Emit the box as two corners around the recovered center.
+  decoded[0] = box_cx - box_w / 2;
+  decoded[1] = box_cy - box_h / 2;
+  decoded[2] = box_cx + box_w / 2;
+  decoded[3] = box_cy + box_h / 2;
+}
+// GPU kernel wrapper for the box_coder operator: reads PriorBox, PriorBoxVar
+// and TargetBox, allocates OutputBox as [row, col, len] and launches the
+// encode or decode device kernel selected by the `code_type` attribute.
+template <typename T>
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    // A TargetBox without LoD is accepted; with LoD, only one level is.
+    if (target_box->lod().size()) {
+      // 1UL keeps both operands unsigned, as the CPU kernel's check does.
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    // The device kernels take `int` extents, so the narrowing from the dim
+    // type is spelled out here instead of happening silently at the launch.
+    const int row_n = static_cast<int>(row);
+    const int col_n = static_cast<int>(col);
+    const int len_n = static_cast<int>(len);
+    // One thread per (target box, prior box) pair, rounded up to whole blocks.
+    int block = 512;
+    int grid = static_cast<int>((row * col + block - 1) / block);
+    auto& device_ctx = context.cuda_device_context();
+    const T* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = prior_box_var->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    T* output = output_box->data<T>();
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row_n, col_n,
+          len_n, output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row_n, col_n,
+          len_n, output);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
+                        ops::BoxCoderCUDAKernel<double>);
--- a/paddle/operators/box_coder_op.h
+++ b/paddle/operators/box_coder_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+// Coding modes of the box_coder operator.
+enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
+// Maps a code type string to its BoxCodeType value.
+// "encode_center_size" and "decode_center_size" are recognized; any other
+// string is reported through PADDLE_THROW.
+inline BoxCodeType GetBoxCodeType(const std::string& type) {
+  const bool is_encode = (type == "encode_center_size");
+  const bool is_decode = (type == "decode_center_size");
+  if (!is_encode && !is_decode) {
+    PADDLE_THROW("Not support type %s.", type);
+  }
+  return is_encode ? BoxCodeType::kEncodeCenterSize
+                   : BoxCodeType::kDecodeCenterSize;
+}
+// CPU kernel for the box_coder operator. Every target box (row of TargetBox)
+// is combined with every prior box (row of PriorBox), producing one group of
+// four values per pair in OutputBox. Offsets 0..3 of a box row are read as
+// xmin, ymin, xmax, ymax.
+template <typename T>
+class BoxCoderKernel : public framework::OpKernel<T> {
+ public:
+  // Writes, for target row i and prior row j, the center offsets (scaled by
+  // the prior size) and the log size ratios, each divided by the matching
+  // variance, to output[(i * col + j) * len ...].
+  void EncodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    const int64_t row = target_box.dims()[0];
+    const int64_t col = prior_box.dims()[0];
+    const int64_t len = prior_box.dims()[1];
+    const T* targets = target_box.data<T>();
+    const T* priors = prior_box.data<T>();
+    const T* variances = prior_box_var.data<T>();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* target = targets + i * len;
+      for (int64_t j = 0; j < col; ++j) {
+        const T* prior = priors + j * len;
+        const T* variance = variances + j * len;
+        // Prior box size and center.
+        T prior_w = prior[2] - prior[0];
+        T prior_h = prior[3] - prior[1];
+        T prior_cx = (prior[2] + prior[0]) / 2;
+        T prior_cy = (prior[3] + prior[1]) / 2;
+        // Target box center and size.
+        T target_cx = (target[2] + target[0]) / 2;
+        T target_cy = (target[3] + target[1]) / 2;
+        T target_w = target[2] - target[0];
+        T target_h = target[3] - target[1];
+        size_t offset = i * col * len + j * len;
+        T* encoded = output + offset;
+        encoded[0] = (target_cx - prior_cx) / prior_w / variance[0];
+        encoded[1] = (target_cy - prior_cy) / prior_h / variance[1];
+        encoded[2] = std::log(std::fabs(target_w / prior_w)) / variance[2];
+        encoded[3] = std::log(std::fabs(target_h / prior_h)) / variance[3];
+      }
+    }
+  }
+  // Inverse of the encoding: recovers a box center and size from the encoded
+  // target values and the prior box, then writes the box as two corners to
+  // output[(i * col + j) * len ...].
+  void DecodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    const int64_t row = target_box.dims()[0];
+    const int64_t col = prior_box.dims()[0];
+    const int64_t len = prior_box.dims()[1];
+    const T* targets = target_box.data<T>();
+    const T* priors = prior_box.data<T>();
+    const T* variances = prior_box_var.data<T>();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* target = targets + i * len;
+      for (int64_t j = 0; j < col; ++j) {
+        const T* prior = priors + j * len;
+        const T* variance = variances + j * len;
+        // Prior box size and center.
+        T prior_w = prior[2] - prior[0];
+        T prior_h = prior[3] - prior[1];
+        T prior_cx = (prior[2] + prior[0]) / 2;
+        T prior_cy = (prior[3] + prior[1]) / 2;
+        // Recovered box center and size.
+        T box_cx = variance[0] * target[0] * prior_w + prior_cx;
+        T box_cy = variance[1] * target[1] * prior_h + prior_cy;
+        T box_w = std::exp(variance[2] * target[2]) * prior_w;
+        T box_h = std::exp(variance[3] * target[3]) * prior_h;
+        size_t offset = i * col * len + j * len;
+        T* decoded = output + offset;
+        decoded[0] = box_cx - box_w / 2;
+        decoded[1] = box_cy - box_h / 2;
+        decoded[2] = box_cx + box_w / 2;
+        decoded[3] = box_cy + box_h / 2;
+      }
+    }
+  }
+  // Allocates OutputBox as [row, col, len] and dispatches to the encoder or
+  // decoder according to the `code_type` attribute.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    // A TargetBox without LoD is accepted; with LoD, only one level is.
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    T* output = output_box->data<T>();
+    switch (code_type) {
+      case BoxCodeType::kEncodeCenterSize:
+        EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+        break;
+      case BoxCodeType::kDecodeCenterSize:
+        DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+        break;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -54,7 +54,15 @@ class CompareOpKernel
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    using T = typename Functor::ELEM_TYPE;
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
+    using Tensor = framework::Tensor;
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    z->mutable_data<T>(context.GetPlace());
+    int axis = context.Attr<int>("axis");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
+                                                          z);
  }
 };

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
            ops::ConvOpGrad);
+// depthwise convolution op
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);
+// depthwise conv kernel
+// TODO(xingzhaolong): neon kernel for mobile
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);

--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -16,6 +16,16 @@ limitations under the License. */
 namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    depthwise_conv2d_grad,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);

--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/vol2col.h"
@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    }
  }
 };
+// Forward kernel for depthwise_conv2d.
+//
+// Reads "Input" and "Filter", requests the "Output" buffer as T, checks that
+// the output channel count is a multiple of the input channel count, and
+// delegates the computation to math::DepthwiseConvFunctor.
+//
+// NOTE(review): the "dilations" attribute is not forwarded to the functor, so
+// this kernel cannot honor it. It was previously copied into an unused local;
+// that dead copy has been removed. Confirm callers only use dilation 1.
+template <typename DeviceContext, typename T>
+class DepthwiseConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+    // dims()[1] is used as the channel dimension for both tensors.
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input->dims()[1], 0,
+        "The output channels must be a multiple of the input channels");
+    // Kept as mutable local copies because they are handed to the functor
+    // as lvalues.
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+  }
+};
+// Backward kernel for depthwise_conv2d.
+//
+// Computes the gradient w.r.t. "Input" and/or "Filter" from the gradient of
+// "Output". Either gradient output may be absent (null); if both are absent
+// the kernel returns without doing any work. Each requested gradient buffer
+// is zero-filled before the corresponding functor runs.
+//
+// NOTE(review): the "dilations" attribute is not forwarded to either
+// functor, so it cannot be honored here. The previously unused local copy of
+// it has been removed. Confirm callers only use dilation 1.
+template <typename DeviceContext, typename T>
+class DepthwiseConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+    // Nothing was requested, so skip attribute lookup and device work.
+    if (!input_grad && !filter_grad) return;
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (input_grad) {
+      math::DepthwiseConvInputGradFunctor<DeviceContext, T>
+          depthwiseConvInputGrad;
+      input_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                             paddings, input_grad);
+    }
+    if (filter_grad) {
+      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
+          depthwiseConvFilterGrad;
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
+                              filter_grad);
+    }
+  }
+};
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
    auto stream = ctx.cuda_device_context().stream();
    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
        merge_repeated, dev_out_lod0_ptr, output_data);
    // set output lod
-    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
-                                              dev_out_lod0.end());
    framework::LoD out_lod;
    out_lod.push_back(host_out_lod0);
    output->set_lod(out_lod);

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                         "'dropout_prob' must be between 0.0 and 1.0.");
        });
    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest or for debug that always the same output units "
+                  "will be dropped.")
+        .SetDefault(false);
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
    AddComment(R"DOC(

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
      int size = framework::product(mask->dims());
-      int seed = context.Attr<int>("seed");
+      std::random_device rnd;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
      thrust::transform(index_sequence_begin, index_sequence_begin + size,
                        thrust::device_ptr<T>(mask_data),

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
    if (!context.Attr<bool>("is_test")) {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
-      int seed = context.Attr<int>("seed");
+      // NOTE: fixed seed should only be used in unittest or for debug.
+      // Guarantee to use random seed in training.
+      std::random_device rnd;
      std::minstd_rand engine;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
      engine.seed(seed);
      std::uniform_real_distribution<float> dist(0, 1);
      size_t size = framework::product(mask->dims());
      for (size_t i = 0; i < size; ++i) {

--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
                            ElementwiseAddBroadCastGradFunctor<T>,
-                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseAddBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
                            ElementwiseDivBroadCastGradFunctor<T>,
-                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseDivBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/operators/elementwise_max_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
                            ElementwiseMaxBroadCastGradFunctor<T>,
-                           ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMaxBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/operators/elementwise_min_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
                            ElementwiseMinBroadCastGradFunctor<T>,
-                           ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMinBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
                            ElementwiseMulBroadCastGradFunctor<T>,
-                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMulBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
 template <typename DeviceContext, typename T, typename functor,
          typename broadcastfunctor, typename broadcast2functor>
-void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
-  using Tensor = framework::Tensor;
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* out = ctx.Input<Tensor>("Out");
-  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+                            const framework::Tensor* x,
+                            const framework::Tensor* y,
+                            const framework::Tensor* out,
+                            const framework::Tensor* dout, int axis,
+                            framework::Tensor* dx, framework::Tensor* dy) {
  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
  auto x_dims = x->dims();
  auto y_dims = y->dims();
-  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
  if (dx) {
    dx->mutable_data<T>(ctx.GetPlace());
  }
@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
    x_dims = framework::make_ddim(extended_dims);
  }
-  int axis = ctx.Attr<int>("axis");
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
  int pre, n, post;
@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
 template <typename Functor, typename DeviceContext, typename T,
          typename OutType = T>
-void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
+void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
-  using Tensor = framework::Tensor;
+                          const framework::Tensor* x,
+                          const framework::Tensor* y, int axis,
-  auto* x = ctx.Input<Tensor>("X");
+                          framework::Tensor* z) {
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<OutType>(ctx.GetPlace());
  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
    x_dims = framework::make_ddim(extended_dims);
  }
-  int axis = ctx.Attr<int>("axis");
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                 "Axis should be in range [0, x_dims)");

--- a/paddle/operators/elementwise_pow_op.cc
+++ b/paddle/operators/elementwise_pow_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/elementwise_pow_op.h"
+#include "paddle/operators/elementwise_op.h"
+namespace paddle {
+namespace operators {
+// Proto maker for the elementwise_pow operator. All work is done by the
+// ElementwiseOpMaker base constructor; this class only supplies the op's
+// display name and equation and then registers the resulting comment.
+class ElementwisePowOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Pow", "Out = X ^ Y");
+    // comment_ is a member inherited from the base class; presumably filled
+    // in by SetComment above — verify against ElementwiseOpMaker.
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// elementwise_pow is registered without a gradient operator, so only the
+// forward kernels (float and double) are bound below.
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
+                             ops::ElementwisePowOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/elementwise_pow_op.cu
+++ b/paddle/operators/elementwise_pow_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_pow_op.h"
+namespace ops = paddle::operators;
+// CUDA forward kernels for elementwise_pow (float and double); the same
+// ElementwisePowKernel template is instantiated for CUDADeviceContext.
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/elementwise_pow_op.h
+++ b/paddle/operators/elementwise_pow_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cmath>
+#include "paddle/operators/elementwise_op_function.h"
+namespace paddle {
+namespace operators {
+// Binary functor returning `base` raised to `exponent` via std::pow. It is
+// the per-element operation ElementwisePowKernel hands to
+// ElementwiseComputeEx.
+template <typename T>
+struct PowFunctor {
+  inline HOSTDEVICE T operator()(T base, T exponent) const {
+    return std::pow(base, exponent);
+  }
+};
+// Kernel for elementwise_pow: applies PowFunctor to inputs X and Y through
+// ElementwiseComputeEx and stores the result in Out.
+template <typename DeviceContext, typename T>
+class ElementwisePowKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const int axis = ctx.Attr<int>("axis");
+    const framework::Tensor* lhs = ctx.Input<framework::Tensor>("X");
+    const framework::Tensor* rhs = ctx.Input<framework::Tensor>("Y");
+    framework::Tensor* result = ctx.Output<framework::Tensor>("Out");
+    // Request Out's buffer with element type T before delegating.
+    result->mutable_data<T>(ctx.GetPlace());
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, lhs, rhs, axis,
+                                                          result);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
+    // Resolve the operands, the output and the axis attribute by slot name
+    // here; ElementwiseComputeEx takes them as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    // Out's buffer is requested with element type T before the helper runs.
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubGradKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Look up the forward tensors, the incoming gradient, the gradient
+    // outputs and the axis attribute by slot name, then pass them to the
+    // shared gradient helper as explicit arguments.
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
                            ElementwiseSubBroadCastGradFunctor<T>,
-                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseSubBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };

--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -52,7 +52,11 @@ class FeedOp : public framework::OperatorBase {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);
-    framework::Copy(feed_item, place, dev_ctx, out_item);
+    if (platform::is_same_place(feed_item.place(), place)) {
+      out_item->ShareDataWith(feed_item);
+    } else {
+      framework::Copy(feed_item, place, dev_ctx, out_item);
+    }
    out_item->set_lod(feed_item.lod());
  }
 };

--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
+// Gives `dst` the dims of `src` on ctx's place, then runs
+// math::CopyMatrixRowsFunctor on `src` with `index_lod` and `indexed_src`,
+// writing into `dst`.
+// NOTE(review): index_lod is taken by value, so every call copies the
+// vector — confirm the functor needs its own copy.
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 template <typename DeviceContext, typename T>
@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
    Tensor ordered_h0;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (h0) {
      // Since the batch computing for GRU reorders the input sequences
      // according to their length. The initialized cell state also needs
@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
    zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
    Tensor ordered_h0, ordered_h0_grad;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (h0) {
      ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
                                         true);

--- a/paddle/operators/label_smooth_op.cc
+++ b/paddle/operators/label_smooth_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/label_smooth_op.h"
+namespace paddle {
+namespace operators {
+// Forward operator definition for label_smooth. InferShape validates the
+// required X/Out slots and the size of the optional PriorDist input, then
+// gives Out the same dims and LoD as X.
+class LabelSmoothOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LabelSmoothOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LabelSmoothOp should not be null.");
+    auto in_dims = ctx->GetInputDim("X");
+    const bool has_prior = ctx->HasInput("PriorDist");
+    if (has_prior) {
+      // PriorDist may have any shape as long as its element count equals
+      // the second dimension of X.
+      auto noise_numel =
+          paddle::framework::product(ctx->GetInputDim("PriorDist"));
+      PADDLE_ENFORCE(
+          in_dims[1] == noise_numel,
+          "The number of elements in Input(PriorDist) must be equal to the "
+          "dimension of each label.");
+    }
+    // Out mirrors X in both shape and LoD.
+    ctx->SetOutputDim("Out", in_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+// Declares the proto (inputs, outputs, attributes, documentation) of the
+// label_smooth operator.
+class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) The input labels of LabelSmooth operator. This "
+             "input can be batched labels in one-hot encoding or output from "
+             "softmax, with shape [N x K], where N is the batch size and K is "
+             "the number of classes");
+    // Adjacent string literals are concatenated verbatim, so every fragment
+    // that is followed by another one must end with a space.
+    AddInput("PriorDist",
+             "(Tensor, optional) "
+             "The prior distribution to be added to the smoothed label. It is "
+             "fixed during training and the number of elements should be equal "
+             "to the dimension K of each label. Default is uniform "
+             "distribution and each element will be set to 1/K if not provided "
+             "in input.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(loDTensor) The smoothed label of LabelSmooth operator. It has "
+              "the same shape and LoD with the Input(LoDTensor).");
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0f) "
+                   "The smoothing parameter of LabelSmooth operator.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+LabelSmooth Operator.
+Label smoothing is a mechanism to regularize the classifier layer. In machine 
+learning, optimizing the log-likelihood of the correct label directly may 
+cause two problems. First, it may result in overfitting: if the model learns 
+to assign full probability to the ground-truth label for each training example,
+it is not guaranteed to generalize. Second, it encourages the differences 
+between the largest logit and all others to become large, reducing the ability 
+of the model to adapt. Label smoothing is proposed to encourage the model to 
+be less confident, which replaces the ground-truth label $y$ with the weighted 
+sum of itself and some fixed distribution $\mu$, i.e.
+$$
+    \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
+$$
+where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and 
+$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for 
+$\mu$. This change in the ground-truth label is called label-smoothing 
+regularization or LSR.
+See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+)DOC");
+  }
+};
+// Gradient operator definition for label_smooth: requires X and Out@GRAD,
+// and gives X@GRAD the dims of X.
+class LabelSmoothGradOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothGradOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    // The gradient w.r.t. X has exactly the shape of X.
+    auto x_dims = ctx->GetInputDim("X");
+    const std::string x_grad_name = framework::GradVarName("X");
+    ctx->SetOutputDim(x_grad_name, x_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // short alias used by the registrations below
+REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+            label_smooth_grad, ops::LabelSmoothGradOp);  // op + maker, plus the label_smooth_grad op
+REGISTER_OP_CPU_KERNEL(
+    label_smooth,  // forward kernels on CPU: float and double
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth_grad,  // backward kernels on CPU: float and double
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/label_smooth_op.cu
+++ b/paddle/operators/label_smooth_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/label_smooth_op.h"
+namespace ops = paddle::operators;  // short alias used by the registrations below
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth,  // forward kernels on CUDA: float and double
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth_grad,  // backward kernels on CUDA: float and double
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/label_smooth_op.h
+++ b/paddle/operators/label_smooth_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Forward kernel for label_smooth.
+//
+// Computes Out = (1 - epsilon) * X + epsilon * mu element-wise on the
+// flattened tensors, where mu is PriorDist when that input is given and the
+// constant 1 / K otherwise (K = second dimension of X).
+template <typename DeviceContext, typename T>
+class LabelSmoothKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
+    auto* in_t = ctx.Input<framework::LoDTensor>("X");
+    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
+    auto label_dim = in_t->dims()[1];
+    out_t->mutable_data<T>(ctx.GetPlace());
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto in = framework::EigenVector<T>::Flatten(*in_t);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    if (dist_t) {
+      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
+      // InferShape guarantees PriorDist holds `label_dim` elements, so it has
+      // to be tiled once per row (numel / label_dim times) to line up with
+      // the flattened X. Tiling it numel times would produce a vector
+      // label_dim times too long.
+      out.device(dev) =
+          static_cast<T>(1 - epsilon) * in +
+          static_cast<T>(epsilon) *
+              dist.broadcast(
+                  Eigen::DSizes<int, 1>(in_t->numel() / label_dim));
+    } else {
+      // Uniform prior: every class receives epsilon / K.
+      out.device(dev) = static_cast<T>(1 - epsilon) * in +
+                        static_cast<T>(epsilon / label_dim);
+    }
+  }
+};
+// Backward kernel for label_smooth: X@GRAD = (1 - epsilon) * Out@GRAD,
+// evaluated element-wise on the flattened tensors.
+template <typename DeviceContext, typename T>
+class LabelSmoothGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_grad_t =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad_t =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    // Allocate the output before wrapping it in an Eigen view.
+    x_grad_t->mutable_data<T>(ctx.GetPlace());
+    auto out_grad = framework::EigenVector<T>::Flatten(*out_grad_t);
+    auto x_grad = framework::EigenVector<T>::Flatten(*x_grad_t);
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const T keep_ratio = static_cast<T>(1 - epsilon);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    x_grad.device(dev) = keep_ratio * out_grad;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/layer_norm_op.cc
+++ b/paddle/operators/layer_norm_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/layer_norm_op.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;  // NOTE(review): not referenced anywhere else in this file
+template <typename T>
+using EigenMatrixMapRowMajor = Eigen::Map<  // writable row-major, dynamically sized matrix view over a raw T buffer
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+template <typename T>
+using ConstEigenMatrixMapRowMajor = Eigen::Map<  // read-only counterpart of EigenMatrixMapRowMajor
+    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+// Forward operator definition for layer_norm. X is flattened to a
+// [left, right] matrix at `begin_norm_axis`; Y keeps the dims and LoD of X,
+// while Mean and Variance each get `left` entries.
+class LayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Required input/output slots.
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of LayerNormOp should not be null.");
+    auto x_dim = ctx->GetInputDim("X");
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
+                      "'begin_norm_axis' must be less than the rank of X.");
+    auto flat_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
+    const int left = static_cast<int>(flat_dim[0]);
+    const int right = static_cast<int>(flat_dim[1]);
+    // Scale and Bias are optional; when present each must be a 1-D tensor
+    // with `right` elements.
+    const bool has_scale = ctx->HasInput("Scale");
+    if (has_scale) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+    }
+    const bool has_bias = ctx->HasInput("Bias");
+    if (has_bias) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+    }
+    // One statistic per row of the flattened matrix.
+    ctx->SetOutputDim("Mean", {left});
+    ctx->SetOutputDim("Variance", {left});
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Y");
+  }
+};
+// Declares the proto (inputs, outputs, attributes, documentation) of the
+// layer_norm operator, including the range checks on its attributes.
+class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor.");
+    // Adjacent string literals are concatenated verbatim, so a fragment that
+    // ends a sentence needs a trailing space before the next one.
+    AddInput("Scale",
+             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). "
+             "It is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). "
+             "It is applied to the output.")
+        .AsDispensable();
+    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+        .AsIntermediate();
+    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+        .AsIntermediate();
+    // epsilon is restricted to [0, 0.001].
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-5) Constant for "
+                   "numerical stability")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    // begin_norm_axis must be strictly positive.
+    AddAttr<int>("begin_norm_axis",
+                 "(int default:1), the "
+                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H].")
+        .SetDefault(1)
+        .AddCustomChecker([](const int &begin_norm_axis) {
+          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
+                            "'begin_norm_axis' should be greater than zero.");
+        });
+    AddComment(R"DOC(
+Layer Normalization.
+Layer Norm has been implemented as discussed in the paper:
+https://arxiv.org/abs/1607.06450
+...
+)DOC");
+  }
+};
+// CPU specialization of the layer_norm forward kernel.
+//
+// X is viewed as a row-major [left, right] matrix. For every row the kernel
+// stores the mean in Mean and (mean squared deviation + epsilon) in
+// Variance, then writes (x - mean) / sqrt(Variance) to Y, multiplied by
+// Scale and/or shifted by Bias when those inputs are present.
+template <typename T>
+class LayerNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+    auto flat_dims = framework::flatten_to_2d(x->dims(), begin_norm_axis);
+    const int left = static_cast<int>(flat_dims[0]);
+    const int right = static_cast<int>(flat_dims[1]);
+    ConstEigenMatrixMapRowMajor<T> x_mat(x->data<T>(), left, right);
+    EigenMatrixMapRowMajor<T> mean_col(mean->data<T>(), left, 1);
+    EigenMatrixMapRowMajor<T> var_col(var->data<T>(), left, 1);
+    EigenMatrixMapRowMajor<T> y_mat(y->data<T>(), left, right);
+    // Per-row mean.
+    mean_col = x_mat.rowwise().mean();
+    // Per-row mean of squared deviations; epsilon is folded in here, so the
+    // Variance output already contains it.
+    auto square = [](T v) { return v * v; };
+    auto plus_epsilon = [epsilon](T v) { return v + epsilon; };
+    var_col = (x_mat - mean_col.replicate(1, right))
+                  .unaryExpr(square)
+                  .rowwise()
+                  .mean()
+                  .unaryExpr(plus_epsilon);
+    auto rsqrt = [](T v) { return std::sqrt(1 / v); };
+    // TODO(zcd): Some thinking about output_map, is it appropriate that
+    // `output_map` and `input_map` point to the same memory.
+    auto inv_std = var_col.unaryExpr(rsqrt);
+    // Lazy expression for the normalized rows, shared by all four cases.
+    auto normalized = (x_mat - mean_col.replicate(1, right))
+                          .cwiseProduct(inv_std.replicate(1, right));
+    if (scale && bias) {
+      ConstEigenMatrixMapRowMajor<T> scale_row(scale->data<T>(), 1, right);
+      ConstEigenMatrixMapRowMajor<T> bias_row(bias->data<T>(), 1, right);
+      y_mat = normalized.cwiseProduct(scale_row.replicate(left, 1)) +
+              bias_row.replicate(left, 1);
+    } else if (scale) {
+      ConstEigenMatrixMapRowMajor<T> scale_row(scale->data<T>(), 1, right);
+      y_mat = normalized.cwiseProduct(scale_row.replicate(left, 1));
+    } else if (bias) {
+      ConstEigenMatrixMapRowMajor<T> bias_row(bias->data<T>(), 1, right);
+      y_mat = normalized + bias_row.replicate(left, 1);
+    } else {
+      y_mat = normalized;
+    }
+  }
+};
+// Gradient operator definition for layer_norm. Requires X, Scale, Mean,
+// Variance and Y@GRAD; every requested gradient output takes the dims of
+// the forward input it belongs to. The kernel data type is taken from
+// Y@GRAD.
+class LayerNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of LayerNormOp should not be null.");
+    // check output: each gradient is optional and mirrors its forward input
+    const std::string x_grad = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad)) {
+      ctx->SetOutputDim(x_grad, ctx->GetInputDim("X"));
+    }
+    const std::string scale_grad = framework::GradVarName("Scale");
+    if (ctx->HasOutput(scale_grad)) {
+      ctx->SetOutputDim(scale_grad, ctx->GetInputDim("Scale"));
+    }
+    const std::string bias_grad = framework::GradVarName("Bias");
+    if (ctx->HasOutput(bias_grad)) {
+      ctx->SetOutputDim(bias_grad, ctx->GetInputDim("Bias"));
+    }
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *grad_var = ctx.InputVar(framework::GradVarName("Y"));
+    if (grad_var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    // Y@GRAD may be held either as a LoDTensor or as a plain Tensor.
+    const Tensor *grad_tensor = nullptr;
+    if (grad_var->IsType<LoDTensor>()) {
+      grad_tensor = &grad_var->Get<LoDTensor>();
+    } else if (grad_var->IsType<Tensor>()) {
+      grad_tensor = &grad_var->Get<Tensor>();
+    }
+    if (grad_tensor == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(
+        framework::ToDataType(grad_tensor->type()), ctx.GetPlace());
+  }
+};
+// CPU specialization of the layer_norm backward kernel.
+//
+// X and Y@GRAD are viewed as row-major [left, right] matrices; Mean and
+// Variance are the per-row statistics saved by the forward kernel (Variance
+// already includes epsilon). Each of Bias@GRAD, Scale@GRAD and X@GRAD is
+// computed only when the corresponding output is requested.
+template <typename T>
+class LayerNormGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *mean = ctx.Input<Tensor>("Mean");
+    const auto *var = ctx.Input<Tensor>("Variance");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto &x_dims = x->dims();
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
+    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
+    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
+    if (d_bias) {
+      // Bias@GRAD: column sums of Y@GRAD.
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
+      d_bias_map = d_y_map.colwise().sum();
+    }
+    if (d_scale) {
+      // Scale@GRAD: column sums of (normalized x) * Y@GRAD.
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      auto d_scale_map =
+          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // There are two equation to compute d_scale. One uses "Y" and the other
+      // does not use "Y"
+      d_scale_map =
+          ((x_map - mean_map.replicate(1, right))
+               .cwiseProduct(
+                   var_map.unaryExpr(inv_std_func).replicate(1, right))
+               .cwiseProduct(d_y_map))
+              .colwise()
+              .sum();
+    }
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
+      auto triple_product_func = [](T ele) { return ele * ele * ele; };
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // TODO(zcd): these code can be refined
+      // Whether X@GRAD must account for Scale depends on whether Scale took
+      // part in the forward pass (the input is present), not on whether
+      // Scale@GRAD happens to be requested as an output.
+      if (scale) {
+        auto scale_map =
+            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map)
+                          .cwiseProduct(scale_map.replicate(left, 1));
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .cwiseProduct(scale_map.replicate(left, 1))
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(scale_map.replicate(left, 1))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+        d_x_map = dx_end + dx_mean + dx_var;
+      } else {
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map);
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+        d_x_map = dx_end + dx_mean + dx_var;
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // short alias used by the registrations below
+REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+            layer_norm_grad, ops::LayerNormGradOp);  // op + maker, plus the layer_norm_grad op
+REGISTER_OP_CPU_KERNEL(
+    layer_norm,  // forward kernel: CPU, float only
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm_grad,  // backward kernel: CPU, float only
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/operators/layer_norm_op.h
+++ b/paddle/operators/layer_norm_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LayerNormKernel : public framework::OpKernel<T> {  // primary template of the layer_norm forward kernel
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;  // declared only; this header provides no definition
+};
+template <typename DeviceContext, typename T>
+class LayerNormGradKernel : public framework::OpKernel<T> {  // primary template of the layer_norm backward kernel
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;  // declared only; this header provides no definition
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/listen_and_serv_op.cc
+++ b/paddle/operators/listen_and_serv_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+#include <unistd.h>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/operators/detail/grpc_server.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/string/printf.h"
+namespace paddle {
+namespace operators {
+constexpr char kOptimizeBlock[] = "OptimizeBlock";  // name of the BlockDesc* attribute that ListenAndServOp::Run executes
+void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {  // body of the server thread started by ListenAndServOp
+  service->RunSyncUpdate();  // NOTE(review): presumably blocks until the server is shut down - confirm in grpc_server.h
+  VLOG(4) << "RunServer thread end";
+}
+// Makes `var` hold the container type that matches the wire-level
+// `var_type` (LoDTensor or SelectedRows); any other type is rejected.
+static void CreateTensorFromMessageType(framework::Variable *var,
+                                        sendrecv::VarType var_type) {
+  switch (var_type) {
+    case sendrecv::VarType::LOD_TENSOR:
+      var->GetMutable<framework::LoDTensor>();
+      return;
+    case sendrecv::VarType::SELECTED_ROWS:
+      var->GetMutable<framework::SelectedRows>();
+      return;
+    default:
+      PADDLE_THROW(
+          "VariableMessage type %d is not in "
+          "[LoDTensor, SelectedRows]",
+          var_type);
+  }
+}
+// Server-side operator: owns an AsyncGRPCServer plus the thread that runs
+// it, receives gradient variables from `Fanin` trainers, and after every
+// full batch barrier executes the optimize block on the received values.
+class ListenAndServOp : public framework::OperatorBase {
+ public:
+  // Creates the RPC server for the "endpoint" attribute and starts the
+  // thread that serves it.
+  ListenAndServOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+    }
+  }
+  // Queues a terminate message for the Run() loop, shuts the server down and
+  // waits for the server thread to finish.
+  void Stop() override {
+    detail::MessageWithName term_msg;
+    term_msg.first = LISTEN_TERMINATE_MESSAGE;
+    rpc_service_->Push(term_msg);
+    rpc_service_->ShutDown();
+    server_thread_->join();
+  }
+  // Returns "<varname>.trainer_<k>", where k counts how many times `varname`
+  // has been requested since the counters were last cleared.
+  std::string GetGradVarNameForTrainer(const std::string &varname) const {
+    if (grads_counter_.find(varname) == grads_counter_.end()) {
+      grads_counter_[varname] = 0;
+    }
+    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
+  }
+  // Main serving loop. Each iteration collects variables until `Fanin`
+  // batch-barrier messages have arrived, runs the optimize block, and then
+  // waits for the clients to fetch the results. A terminate message ends the
+  // loop.
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    framework::Scope &recv_scope = scope.NewScope();
+    // FIXME(Yancey1989): initialize rpc server with lazy mode.
+    rpc_service_->SetScope(&recv_scope);
+    rpc_service_->SetDevCtx(&dev_ctx);
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto fan_in = Attr<int>("Fanin");
+    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+    auto *program = block->Program();
+    framework::Executor executor(dev_place);
+    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
+    bool exit_flag = false;
+    while (!exit_flag) {
+      // Get from multiple trainers, we don't care about the order in which
+      // the gradients arrives, just add suffix 0~n and merge the gradient.
+      rpc_service_->SetCond(0);
+      size_t recv_var_cnt = 0;
+      int batch_barrier = 0;
+      while (batch_barrier != fan_in) {
+        const detail::MessageWithName &v = rpc_service_->Get();
+        auto grad_var_name = v.first;
+        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
+          LOG(INFO) << "received terminate message and exit";
+          exit_flag = true;
+          break;
+        } else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
+          VLOG(3) << "recv batch barrier message";
+          batch_barrier++;
+          continue;
+        } else {
+          // receive a variable
+          recv_var_cnt++;
+          auto it =
+              std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+          std::string param_var_name;
+          if (it != grad_list.end()) {
+            // ParamList and GradList are separate attributes, so the index
+            // is checked before it is used on ParamList.
+            const size_t grad_idx =
+                static_cast<size_t>(it - grad_list.begin());
+            if (grad_idx < param_list.size()) {
+              param_var_name = param_list[grad_idx];
+            } else {
+              LOG(ERROR) << "grad has no paired param:" << grad_var_name;
+            }
+          } else {
+            LOG(ERROR) << "grad has no paired param:" << grad_var_name;
+          }
+          VLOG(3) << "received grad: " << grad_var_name
+                  << " updating param: " << param_var_name;
+          if (fan_in > 1) {
+            grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
+          }
+          auto *var = recv_scope.FindVar(grad_var_name);
+          if (var == nullptr) {
+            LOG(ERROR) << "Can not find server side var: " << grad_var_name;
+            PADDLE_THROW("Can not find server side var");
+          }
+          detail::DeserializeFromMessage(v.second, dev_ctx, var);
+        }
+      }
+      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
+      // TODO(Yancey1989): merge SelectedRows variables here
+      if (exit_flag) {
+        // Terminating: the batch is incomplete, so do not run the optimize
+        // block and do not wait for clients on a server that is going away.
+        // Release the condition, shut down and leave the loop.
+        rpc_service_->SetCond(1);
+        rpc_service_->ShutDown();
+        break;
+      }
+      try {
+        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
+                     false /*create_local_scope*/, false /*create_vars*/);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+      rpc_service_->SetCond(1);
+      rpc_service_->WaitClientGet(recv_var_cnt);
+      grads_counter_.clear();
+    }  // while(true)
+  }
+ protected:
+  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+  // Per-variable receive counters used to build the ".trainer_<k>" names;
+  // mutable because Run() is const.
+  mutable std::unordered_map<std::string, int> grads_counter_;
+};
+// Proto/attribute declaration for the listen_and_serv operator.
+// Registers the operator comment and the attributes "endpoint",
+// kOptimizeBlock, "ParamList", "GradList" and "Fanin".
+class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(
+ListenAndServ operator
+This operator will start a RPC server which can receive variables
+from send_op and send back variables to recv_op.
+)DOC");
+    // Address the server listens on; defaults to 127.0.0.1:6164.
+    // NOTE(review): the checker lambda returns a bool instead of raising;
+    // confirm AddCustomChecker actually acts on that return value.
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    // Block that ListenAndServOp::Run executes after each round of received
+    // gradients. No default is set for this attribute.
+    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
+                                    "BlockID to run on server side.");
+    // ParamList and GradList are used together: Run looks a received gradient
+    // name up in GradList and takes the ParamList entry at the same index, so
+    // the two lists are expected to be index-aligned.
+    // NOTE(review): AddAttr receives three string arguments in the two calls
+    // below (and for "Fanin"); confirm the third string binds to the intended
+    // parameter of AddAttr and is not silently converted to another type.
+    AddAttr<std::vector<std::string>>(
+        "ParamList", "type list of string",
+        "grad->param name mapping to find which parameters to optimize.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "GradList", "type list of string",
+        "grad->param name mapping to find which parameters to optimize.")
+        .SetDefault({});
+    // Number of batch-barrier messages Run waits for before it runs the
+    // optimize block; defaults to a single trainer.
+    AddAttr<int>("Fanin", "type int",
+                 "Number of trainers in the current cluster job")
+        .SetDefault(1);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register the operator under the name "listen_and_serv" together with its
+// proto maker.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(listen_and_serv, ops::ListenAndServOp,
+                  ops::ListenAndServOpMaker);
--- a/paddle/operators/load_combine_op.cc
+++ b/paddle/operators/load_combine_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+namespace paddle {
+namespace operators {
+// Operator that restores several LoDTensors, in order, from a single file.
+//
+// The file named by the "file_path" attribute is read sequentially: one
+// serialized tensor is consumed for every variable listed in the "Out"
+// output, in the order the variables are listed.
+class LoadCombineOp : public framework::OperatorBase {
+ public:
+  LoadCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  // Loads every output variable from the combined file.
+  //
+  // scope: scope in which the output variables must already exist.
+  // place: device the loaded tensors should finally live on.
+  //
+  // Fails through PADDLE_ENFORCE when the file cannot be opened, when no
+  // output is given, when an output variable is missing from the scope, or
+  // when the stream is in a failed state before a tensor is read.
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    // The file holds serialized tensor bytes, so it is opened in binary mode
+    // to rule out any platform-specific newline translation while reading.
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin),
+                   "Cannot open file %s for load_combine op", filename);
+    auto out_var_names = Outputs("Out");
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      auto *out_var = scope.FindVar(out_var_names[i]);
+      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                     out_var_names[i]);
+      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+      // A failed stream here means a previous read hit EOF or an error, so
+      // the file holds fewer tensors than there are outputs.
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+                     filename);
+      // Get data from fin to tensor
+      DeserializeFromStream(fin, tensor, dev_ctx);
+      if (platform::is_gpu_place(place)) {
+        // Keep a handle on the freshly loaded data, reset the output
+        // variable, then copy the data into it on the target place.
+        framework::LoDTensor cpu_tensor;
+        cpu_tensor.ShareDataWith(*tensor);
+        cpu_tensor.set_lod(tensor->lod());
+        // reset tensor
+        out_var->Clear();
+        tensor = out_var->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(cpu_tensor.lod());
+        Copy(cpu_tensor, place, dev_ctx, tensor);
+      }
+    }
+  }
+};
+// Proto/attribute declaration for the load_combine operator: one duplicable
+// output "Out" and one string attribute "file_path".
+class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // "Out" is duplicable: LoadCombineOp::Run fills one variable per entry.
+    AddOutput(
+        "Out",
+        "(vector) The output LoDTensors that will be read from the input file.")
+        .AsDuplicable();
+    // Path of the combined file; no default value is set.
+    // NOTE(review): the checker lambda returns a bool instead of raising;
+    // confirm AddCustomChecker actually acts on that return value.
+    AddAttr<std::string>("file_path",
+                         "(string) "
+                         "LoDTensors will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+LoadCombine Operator.
+LoadCombine operator loads LoDTensor variables from a file. The file should 
+contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator applies a deserialization strategy to appropriately load 
+the LodTensors, and this strategy complements the serialization strategy used 
+in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
+with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+that were saved using the SaveCombine operator.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Register the operator under the name "load_combine" together with its
+// proto maker.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
+                  ops::LoadCombineOpProtoMaker);
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
      new_rows.resize(ids_dim[0]);
      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
-      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
+      memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
-                   ids_dim[0] * sizeof(int64_t), stream);
+                   ids_data, ids_dim[0] * sizeof(int64_t), stream);
      d_table->set_rows(new_rows);

--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                             framework::Tensor* dst, bool indexed_src) {
  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 template <typename DeviceContext, typename T>
@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
    }
    lstm_value.prev_state_value = nullptr;
    Tensor ordered_c0;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (cell_t0) {
      // Since the batch computing for LSTM reorders the input sequence
      // according to their length. The initialized cell state also needs
@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
    // initialization.
    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (c0) {
      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
                                         true);

--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index,
                             framework::Tensor* dst, bool indexed_src) {
  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
@@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
    }
    lstmp_value.prev_state_value = nullptr;
    Tensor ordered_c0;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (cell_t0) {
      // Since the batch computing for LSTMP reorders the input sequence
      // according to their length. The initialized cell state also needs
@@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
    // initialization.
    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
    if (c0) {
      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
                                         true);

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,6 +8,7 @@ if(WITH_GPU)
    nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)

--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/depthwise_conv.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+// CUDA kernel for the forward pass of depthwise convolution, NCHW layout.
+//
+// Each thread with index < nthreads produces exactly one output element.
+// Output channel c_out reads input channel c_out / filter_multiplier and the
+// filter slice starting at c_out * filter_height * filter_width. Window
+// positions that fall outside the input (padding) are skipped.
+template <typename T>
+__global__ void KernelDepthwiseConv(
+    const int nthreads, const T* const input_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, T* const output_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  // Threads beyond the last output element have nothing to do.
+  if (index >= nthreads) return;
+  // Split the flat output index into (batch, channel, row, column).
+  const int w_out = index % output_width;
+  const int h_out = (index / output_width) % output_height;
+  const int c_out = (index / output_height / output_width) % output_channels;
+  const int batch = index / output_channels / output_height / output_width;
+  const int c_in = c_out / filter_multiplier;
+  // Top-left corner of the window in input coordinates; negative values
+  // mean the window starts inside the padding.
+  const int h_origin = -padding_height + h_out * stride_height;
+  const int w_origin = -padding_width + w_out * stride_width;
+  // Clip the window to the rows and columns that exist in the input.
+  const int h_first = h_origin > 0 ? h_origin : 0;
+  const int w_first = w_origin > 0 ? w_origin : 0;
+  const int h_limit = h_origin + filter_height < input_height
+                          ? h_origin + filter_height
+                          : input_height;
+  const int w_limit = w_origin + filter_width < input_width
+                          ? w_origin + filter_width
+                          : input_width;
+  const T* kernel = filter_data + c_out * filter_height * filter_width;
+  const int input_base =
+      ((batch * input_channels + c_in) * input_height) * input_width;
+  T acc = 0;
+  for (int h = h_first; h < h_limit; h++) {
+    const int kernel_row = (h - h_origin) * filter_width;
+    const int input_row = input_base + h * input_width;
+    for (int w = w_first; w < w_limit; w++) {
+      acc += kernel[kernel_row + (w - w_origin)] * input_data[input_row + w];
+    }
+  }
+  output_data[index] = acc;
+}
+// CUDA kernel for the depthwise convolution gradient w.r.t. the input (NCHW).
+//
+// Each thread with index < nthreads handles one input element. It sums
+// output_grad * filter over the filter_multiplier output channels derived
+// from the element's channel and over every output position whose window
+// covers the element. The sum is added (+=) to input_grad_data[index], so
+// the value already stored there is part of the result.
+template <typename T>
+__global__ void KernelDepthwiseConvInputGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const filter_data, const int batch_size, const int output_channels,
+    const int output_height, const int output_width, const int input_channels,
+    const int input_height, const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const input_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  // Threads beyond the last input element have nothing to do.
+  if (index >= nthreads) return;
+  // Split the flat input index into (batch, channel, row, column).
+  const int w_in = index % input_width;
+  const int h_in = (index / input_width) % input_height;
+  const int c_in = (index / input_height / input_width) % input_channels;
+  const int batch = index / input_channels / input_height / input_width;
+  // Inclusive range of output rows whose window contains h_in, clamped to
+  // the valid output rows.
+  int h_out_first =
+      (h_in - filter_height + padding_height + stride_height) / stride_height;
+  if (h_out_first < 0) h_out_first = 0;
+  int h_out_last = (h_in + padding_height) / stride_height;
+  if (h_out_last > output_height - 1) h_out_last = output_height - 1;
+  // Same for the output columns whose window contains w_in.
+  int w_out_first =
+      (w_in - filter_width + padding_width + stride_width) / stride_width;
+  if (w_out_first < 0) w_out_first = 0;
+  int w_out_last = (w_in + padding_width) / stride_width;
+  if (w_out_last > output_width - 1) w_out_last = output_width - 1;
+  const int c_out_first = c_in * filter_multiplier;
+  const int c_out_limit = c_out_first + filter_multiplier;
+  T acc = 0;
+  for (int c_out = c_out_first; c_out < c_out_limit; c_out++) {
+    const int filter_base = c_out * filter_height * filter_width;
+    const int grad_base = (batch * output_channels + c_out) * output_height;
+    for (int h_out = h_out_first; h_out <= h_out_last; ++h_out) {
+      // Filter row that multiplies (h_in) when the window sits at h_out.
+      const int filter_h = h_in + padding_height - h_out * stride_height;
+      for (int w_out = w_out_first; w_out <= w_out_last; ++w_out) {
+        const int filter_w = w_in + padding_width - w_out * stride_width;
+        const int filter_idx = filter_base + filter_h * filter_width + filter_w;
+        const int grad_idx = (grad_base + h_out) * output_width + w_out;
+        acc += output_grad_data[grad_idx] * filter_data[filter_idx];
+      }
+    }
+  }
+  input_grad_data[index] += acc;
+}
+// CUDA kernel for the depthwise convolution gradient w.r.t. the filter (NCHW).
+//
+// Each thread with index < nthreads handles one element of output_grad. For
+// every in-bounds input position under that element's window it adds
+// output_grad * input to the matching filter-gradient entry. Different
+// threads update the same entries, so the update goes through CudaAtomicAdd;
+// the values are accumulated on top of what filter_grad_data already holds.
+template <typename T>
+__global__ void KernelDepthwiseConvFilterGrad(
+    const int nthreads, const T* const output_grad_data,
+    const T* const input_data, const int num, const int output_channels,
+    const int output_height, const int output_width, const int input_channels,
+    const int input_height, const int input_width, const int filter_multiplier,
+    const int filter_height, const int filter_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* const filter_grad_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  // Threads beyond the last output-gradient element have nothing to do.
+  if (index >= nthreads) return;
+  // Split the flat output index into (batch, channel, row, column).
+  const int w_out = index % output_width;
+  const int h_out = (index / output_width) % output_height;
+  const int c_out = (index / output_width / output_height) % output_channels;
+  const int batch = (index / output_width / output_height / output_channels);
+  const int c_in = c_out / filter_multiplier;
+  // Top-left corner of the window in input coordinates; negative values
+  // mean the window starts inside the padding.
+  const int h_origin = -padding_height + h_out * stride_height;
+  const int w_origin = -padding_width + w_out * stride_width;
+  // Clip the window to the rows and columns that exist in the input.
+  const int h_first = h_origin > 0 ? h_origin : 0;
+  const int w_first = w_origin > 0 ? w_origin : 0;
+  const int h_limit = h_origin + filter_height < input_height
+                          ? h_origin + filter_height
+                          : input_height;
+  const int w_limit = w_origin + filter_width < input_width
+                          ? w_origin + filter_width
+                          : input_width;
+  const int input_base =
+      (batch * input_channels + c_in) * input_height * input_width;
+  T* grad_slice = filter_grad_data + c_out * filter_height * filter_width;
+  for (int h = h_first; h < h_limit; h++) {
+    T* grad_row = grad_slice + (h - h_origin) * filter_width;
+    const int input_row = input_base + h * input_width;
+    for (int w = w_first; w < w_limit; w++) {
+      const T contribution = output_grad_data[index] * input_data[input_row + w];
+      paddle::platform::CudaAtomicAdd(grad_row + (w - w_origin), contribution);
+    }
+  }
+}
+/*
+ * CUDA forward functor for depthwise convolution.
+ * All tensors are in NCHW format. The filter height and width are read from
+ * dims 2 and 3 of `filter`. `strides` and `paddings` each hold two elements:
+ * height first, width second.
+ * Launches KernelDepthwiseConv on context.stream() with one thread per
+ * output element, 1024 threads per block.
+ */
+template <class T>
+class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output) {
+    // Input geometry (N, C, H, W).
+    const auto& in_dims = input.dims();
+    const int num = in_dims[0];
+    const int in_c = in_dims[1];
+    const int in_h = in_dims[2];
+    const int in_w = in_dims[3];
+    // Output geometry (C, H, W); the batch size is shared with the input.
+    const auto& out_dims = output->dims();
+    const int out_c = out_dims[1];
+    const int out_h = out_dims[2];
+    const int out_w = out_dims[3];
+    // Filter window, stride and padding as (height, width).
+    const auto& filter_dims = filter.dims();
+    const int kernel_h = filter_dims[2];
+    const int kernel_w = filter_dims[3];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+    const T* in_ptr = input.data<T>();
+    const T* filter_ptr = filter.data<T>();
+    T* out_ptr = output->mutable_data<T>(context.GetPlace());
+    // One thread per output element, rounded up to whole blocks.
+    const int kThreadsPerBlock = 1024;
+    int total = num * out_c * out_h * out_w;
+    int num_blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    dim3 block_dim(kThreadsPerBlock, 1);
+    dim3 grid_dim(num_blocks, 1);
+    KernelDepthwiseConv<T><<<grid_dim, block_dim, 0, context.stream()>>>(
+        total, in_ptr, filter_ptr, num, out_c, out_h, out_w, in_c, in_h, in_w,
+        out_c / in_c, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
+        out_ptr);
+  }
+};
+// CUDA functor for the depthwise convolution gradient w.r.t. the input.
+// `input` is only consulted for its dimensions. Launches
+// KernelDepthwiseConvInputGrad on context.stream() with one thread per input
+// element, 1024 threads per block. `strides` and `paddings` are read as
+// (height, width).
+template <typename T>
+class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    // Input geometry (N, C, H, W).
+    const auto& in_dims = input.dims();
+    const int num = in_dims[0];
+    const int in_c = in_dims[1];
+    const int in_h = in_dims[2];
+    const int in_w = in_dims[3];
+    // Output-gradient geometry (C, H, W).
+    const auto& out_dims = output_grad.dims();
+    const int out_c = out_dims[1];
+    const int out_h = out_dims[2];
+    const int out_w = out_dims[3];
+    // Filter window, stride and padding as (height, width).
+    const auto& filter_dims = filter.dims();
+    const int kernel_h = filter_dims[2];
+    const int kernel_w = filter_dims[3];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+    const T* filter_ptr = filter.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad->mutable_data<T>(context.GetPlace());
+    // One thread per input element, rounded up to whole blocks.
+    const int kThreadsPerBlock = 1024;
+    int total = num * in_c * in_h * in_w;
+    int num_blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    dim3 block_dim(kThreadsPerBlock, 1);
+    dim3 grid_dim(num_blocks, 1);
+    KernelDepthwiseConvInputGrad<
+        T><<<grid_dim, block_dim, 0, context.stream()>>>(
+        total, out_grad_ptr, filter_ptr, num, out_c, out_h, out_w, in_c, in_h,
+        in_w, out_c / in_c, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w, in_grad_ptr);
+  }
+};
+// CUDA functor for the depthwise convolution gradient w.r.t. the filter.
+// The filter window size is taken from dims 2 and 3 of `filter_grad`.
+// Launches KernelDepthwiseConvFilterGrad on context.stream() with one thread
+// per output-gradient element, 1024 threads per block. `strides` and
+// `paddings` are read as (height, width).
+template <typename T>
+class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* filter_grad) {
+    // Input geometry (N, C, H, W).
+    const auto& in_dims = input.dims();
+    const int num = in_dims[0];
+    const int in_c = in_dims[1];
+    const int in_h = in_dims[2];
+    const int in_w = in_dims[3];
+    // Output-gradient geometry (C, H, W).
+    const auto& out_dims = output_grad.dims();
+    const int out_c = out_dims[1];
+    const int out_h = out_dims[2];
+    const int out_w = out_dims[3];
+    // Filter window, stride and padding as (height, width).
+    const auto& filter_dims = filter_grad->dims();
+    const int kernel_h = filter_dims[2];
+    const int kernel_w = filter_dims[3];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+    const T* in_ptr = input.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* filter_grad_ptr = filter_grad->mutable_data<T>(context.GetPlace());
+    // One thread per output-gradient element, rounded up to whole blocks.
+    const int kThreadsPerBlock = 1024;
+    int total = num * out_c * out_h * out_w;
+    int num_blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
+    dim3 block_dim(kThreadsPerBlock, 1);
+    dim3 grid_dim(num_blocks, 1);
+    KernelDepthwiseConvFilterGrad<
+        T><<<grid_dim, block_dim, 0, context.stream()>>>(
+        total, out_grad_ptr, in_ptr, num, out_c, out_h, out_w, in_c, in_h,
+        in_w, out_c / in_c, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w, filter_grad_ptr);
+  }
+};
+// Explicit instantiations of the three CUDA functors for float and double.
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                             float>;
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                             double>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              float>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/depthwise_conv.h
+++ b/paddle/operators/math/depthwise_conv.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * \brief Forward pass of the depthwise convolution.
+ *
+ * Only the call operator is declared in this header; the
+ * CUDADeviceContext specialization is defined in depthwise_conv.cu.
+ * That specialization treats all tensors as NCHW and reads `strides` and
+ * `paddings` as two-element (height, width) vectors.
+ *
+ * \param context  device context the computation runs on.
+ * \param input    input tensor.
+ * \param filter   filter tensor.
+ * \param strides  convolution strides.
+ * \param paddings convolution paddings.
+ * \param output   tensor that receives the result.
+ */
+template <typename DeviceContext, typename T>
+class DepthwiseConvFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output);
+};
+/*
+ * \brief Backward pass of the depthwise convolution w.r.t. the input.
+ *
+ * Only the call operator is declared in this header; the
+ * CUDADeviceContext specialization is defined in depthwise_conv.cu.
+ *
+ * \param context     device context the computation runs on.
+ * \param input       forward input tensor.
+ * \param filter      filter tensor.
+ * \param output_grad gradient of the forward output.
+ * \param strides     convolution strides.
+ * \param paddings    convolution paddings.
+ * \param input_grad  tensor that receives the input gradient.
+ */
+template <typename DeviceContext, typename T>
+class DepthwiseConvInputGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+/*
+ * \brief Backward pass of the depthwise convolution w.r.t. the filter.
+ *
+ * Only the call operator is declared in this header; the
+ * CUDADeviceContext specialization is defined in depthwise_conv.cu.
+ *
+ * \param context     device context the computation runs on.
+ * \param input       forward input tensor.
+ * \param output_grad gradient of the forward output.
+ * \param strides     convolution strides.
+ * \param paddings    convolution paddings.
+ * \param filter_grad tensor that receives the filter gradient.
+ */
+template <typename DeviceContext, typename T>
+class DepthwiseConvFilterGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* filter_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, input2.height());
    output->set_height(in1_height);
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
    auto& in2_rows = input2.rows();
    std::vector<int64_t> out_rows;
    out_rows.reserve(in1_rows.size() + in2_rows.size());
@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
    dim3 grid(1, in1_rows.size());
    SelectedRowsAddTensorKernel<
        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), out_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto in1_height = input1.height();
    PADDLE_ENFORCE_EQ(in1_height, input2->height());
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
    auto& in2_rows = *(input2->mutable_rows());
    auto& in1_value = input1.value();
@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
    dim3 grid(1, in1_rows.size());
    SelectedRowsAddToTensorKernel<
        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), in2_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
  }
 };
@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                     const framework::SelectedRows& input) {
    framework::SelectedRows out;
-    auto input_rows = input.rows();
+    framework::Vector<int64_t> input_rows(input.rows());
    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
    MergeAddKernel<
        T, 256><<<grid1, threads, 0,
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(input_data, input.rows().data(), out_data,
+                      .stream()>>>(input_data, input_rows.cuda_data(), out_data,
-                                   out.rows().data(), out.rows().size(),
+                                   out.mutable_rows()->cuda_data(),
-                                   input_width);
+                                   out.rows().size(), input_width);
    return out;
  }
 };
@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
    dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
    dim3 grid(1, in1_rows.size());
    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
-        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
-                                              in2_data, in1_row_numel);
+                                              op, in2_data, in1_row_numel);
  }
 };
 }  // namespace scatter

--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@@ -23,8 +23,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
+                  const framework::Tensor& src,
-                  framework::Tensor& dst, bool is_src_index) {
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.data();
    auto src_dims = src.dims();
    auto dst_dims = dst.dims();
    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,

--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@@ -42,8 +42,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
+                  const framework::Tensor& src,
-                  framework::Tensor& dst, bool is_src_index) {
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.cuda_data();
    auto src_dims = src.dims();
    auto dst_dims = dst.dims();
    PADDLE_ENFORCE_EQ(src_dims.size(), 2,

--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
  // copy the input src to the indexed rows of output dst.
  // The indexed rows are based on the input index.
  void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  const size_t* index, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
                  bool is_src_index);
 };
@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
      PADDLE_ENFORCE_EQ(lods[1].size(),
                        static_cast<size_t>(lod_tensor.dims()[0]));
      CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-      to_batch(context, lod_tensor, lods[1].data(), batch, true);
+      to_batch(context, lod_tensor, lods[1], batch, true);
      return;
    }
@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
    batch.set_lod(batch_lods);
    CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
+    to_batch(context, lod_tensor, batch_lods[1], batch, true);
  }
 };
@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
    PADDLE_ENFORCE_EQ(in_lod[1].size(),
                      static_cast<size_t>(lod_tensor.dims()[0]));
    CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
-    size_t* index = in_lod[1].data();
+    to_seq(context, batch, in_lod[1], lod_tensor, false);
-    to_seq(context, batch, index, lod_tensor, false);
  }
 };

--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/operators/math/sequence_padding.cu
@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    T* padding_data = padding.data<T>();
    if (norm_by_times) {
      SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
+          padding_data, const_cast<T*>(seq_data),
-          sequence_width, max_sequence_length, num_sequences);
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
    } else {
      SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
+          padding_data, const_cast<T*>(seq_data),
-          sequence_width, max_sequence_length, num_sequences);
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
    }
  }
 };
@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    T* seq_data = seq.data<T>();
    if (norm_by_times) {
      SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
+          const_cast<T*>(padding_data), seq_data,
-          sequence_width, max_sequence_length, num_sequences);
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
    } else {
      SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
+          const_cast<T*>(padding_data), seq_data,
-          sequence_width, max_sequence_length, num_sequences);
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
    }
  }
 };

--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
    dim3 grid(num_seq, 1);
    auto stream = context.stream();
    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.data(), out_data, max_index, num_seq, dim);
+        in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
  }
 };

--- a/paddle/operators/math/sequence_scale.cu
+++ b/paddle/operators/math/sequence_scale.cu
@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
        num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].data(), scales, seq_width);
+        seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
  }
 };

--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/operators/mine_hard_examples_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Negative-mining strategy. GetMiningType() produces these values from the
+// "mining_type" attribute: "max_negative" -> kMaxNegative,
+// "hard_example" -> kHardExample, anything else -> kNone.
+enum MiningType {
+  kNone = 0,
+  kMaxNegative,
+  kHardExample
+};
+// Ordering predicate for sorting (score, payload) pairs: returns true when the
+// left-hand score is strictly greater, which produces a descending order.
+template <typename T>
+bool SortScoreDescend(const std::pair<float, T>& lhs,
+                      const std::pair<float, T>& rhs) {
+  return rhs.first < lhs.first;
+}
+// Tells whether a prior box may take part in mining.
+//   kMaxNegative: only unmatched boxes (match_idx == -1) whose match distance
+//                 is strictly below neg_dist_threshold.
+//   kHardExample: every box.
+//   any other value: no box.
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dist,
+                             const float neg_dist_threshold) {
+  switch (mining_type) {
+    case MiningType::kMaxNegative:
+      return match_idx == -1 && match_dist < neg_dist_threshold;
+    case MiningType::kHardExample:
+      return true;
+    default:
+      return false;
+  }
+}
+// Converts the textual mining strategy name into a MiningType value.
+// Unrecognized names map to kNone.
+inline MiningType GetMiningType(std::string str) {
+  if (str == "max_negative") return MiningType::kMaxNegative;
+  if (str == "hard_example") return MiningType::kHardExample;
+  return MiningType::kNone;
+}
+// Kernel of the mine_hard_examples operator.
+//
+// For every sample of the batch the eligible prior boxes are ranked by loss
+// and the highest-loss ones are kept:
+//   * max_negative: candidates are the unmatched boxes whose MatchDist is
+//     below neg_dist_threshold; at most num_pos * neg_pos_ratio are kept.
+//   * hard_example: every box is a candidate (loss = ClsLoss + LocLoss when
+//     LocLoss is supplied); at most sample_size are kept. Matched boxes that
+//     were not kept are reset to -1 in UpdatedMatchIndices, and only the kept
+//     unmatched boxes are reported as negatives.
+// Outputs:
+//   NegIndices          - int LoDTensor of shape [Neg, 1]; one level-0 LoD
+//                         segment per sample holding the selected box indices.
+//   UpdatedMatchIndices - copy of MatchIndices, edited in hard_example mode.
+template <typename DeviceContext, typename T>
+class MineHardExamplesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
+    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
+    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
+    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
+    T neg_dist_threshold =
+        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
+    int sample_size = ctx.Attr<int>("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr<std::string>("mining_type"));
+    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
+    auto out_match_indices =
+        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
+    // Start from an unmodified copy of the input match indices.
+    framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
+    int batch_size = in_matched_indices->dims()[0];
+    int prior_num = in_matched_indices->dims()[1];
+    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
+    auto match_indices_et =
+        framework::EigenMatrix<int>::From(*out_match_indices);
+    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
+    const T* cls_loss = in_cls_loss->data<T>();
+    // LocLoss is optional; nullptr means "classification loss only".
+    const T* loc_loss = nullptr;
+    if (in_loc_loss) {
+      loc_loss = in_loc_loss->data<T>();
+    }
+    using LossIdx = std::pair<T, size_t>;
+    std::vector<std::vector<int>> all_neg_indices;
+    // Level-0 LoD offsets of NegIndices; entry n + 1 closes sample n.
+    std::vector<size_t> batch_starts = {0};
+    for (int n = 0; n < batch_size; ++n) {
+      // Collect (loss, box index) for every eligible prior box of sample n.
+      std::vector<LossIdx> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
+                             neg_dist_threshold)) {
+          T loss = cls_loss[n * prior_num + m];
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+      // Cap the number of selected boxes according to the mining strategy.
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+      // Rank by descending loss. The comparison is done in the kernel's own
+      // type T so that double losses are not narrowed to float.
+      std::sort(loss_idx.begin(), loss_idx.end(),
+                [](const LossIdx& lhs, const LossIdx& rhs) {
+                  return lhs.first > rhs.first;
+                });
+      // Box indices of the neg_sel highest losses, ordered by index.
+      std::set<int> sel_indices;
+      std::vector<int> neg_indices;
+      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
+                     std::inserter(sel_indices, sel_indices.begin()),
+                     [](const LossIdx& l) -> int {
+                       return static_cast<int>(l.second);
+                     });
+      if (mining_type == MiningType::kHardExample) {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) > -1) {
+            // A matched box that was not selected is dropped as a positive.
+            if (sel_indices.find(m) == sel_indices.end()) {
+              match_indices_et(n, m) = -1;
+            }
+          } else {
+            // A selected unmatched box becomes a mined negative.
+            if (sel_indices.find(m) != sel_indices.end()) {
+              neg_indices.push_back(m);
+            }
+          }
+        }
+      } else {
+        neg_indices.resize(sel_indices.size());
+        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
+      }
+      all_neg_indices.push_back(neg_indices);
+      batch_starts.push_back(batch_starts.back() + neg_indices.size());
+    }
+    framework::LoD out_neg_indices_lod;
+    out_neg_indices_lod.emplace_back(batch_starts);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indices->mutable_data<int>(
+        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
+        ctx.GetPlace());
+    // Flatten the per-sample index lists into the output buffer.
+    for (const auto& neg_indices : all_neg_indices) {
+      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
+      neg_offset += neg_indices.size();
+    }
+    out_neg_indices->set_lod(out_neg_indices_lod);
+  }
+};
+// Operator definition of mine_hard_examples: validates inputs/attributes at
+// shape-inference time and selects the kernel data type from ClsLoss.
+class MineHardExamplesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Checks that all required inputs/outputs exist, that ClsLoss, MatchIndices,
+  // MatchDist (and LocLoss when present) are rank-2 tensors of identical
+  // shape, and that the attributes relevant to the chosen mining_type are
+  // positive. Only the shape of UpdatedMatchIndices is set here.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
+                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchIndices"),
+        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchDist"),
+        "Input(MatchDist) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegIndices"),
+        "Output(NegIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
+                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
+                   "not be null.");
+    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
+    auto idx_dims = ctx->GetInputDim("MatchIndices");
+    auto dis_dims = ctx->GetInputDim("MatchDist");
+    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
+                      "The shape of ClsLoss is [N, Np].");
+    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
+                      "The shape of MatchIndices is [N, Np].");
+    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
+                      "The shape of MatchDist is [N, Np].");
+    // LocLoss is dispensable, so it is only validated when supplied.
+    if (ctx->HasInput("LocLoss")) {
+      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
+      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
+                        "The shape of LocLoss is [N, Np].");
+      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
+                        "Batch size of ClsLoss and LocLoss must be the same.");
+      PADDLE_ENFORCE_EQ(
+          cls_loss_dims[1], loc_loss_dims[1],
+          "Prior box number of ClsLoss and LocLoss must be the same.");
+    }
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[0], idx_dims[0],
+        "Batch size of ClsLoss and MatchIndices must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchIndices must be the same.");
+    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
+                      "Batch size of ClsLoss and MatchDist must be the same.");
+    // Compare against MatchDist's own second dimension (not MatchIndices'),
+    // otherwise a MatchDist with the wrong width passes validation.
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], dis_dims[1],
+        "Prior box number of ClsLoss and MatchDist must be the same.");
+    auto mining_type =
+        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
+    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
+                      "mining_type must be hard_example or max_negative");
+    if (mining_type == MiningType::kMaxNegative) {
+      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
+      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
+      PADDLE_ENFORCE_GT(
+          neg_pos_ratio, 0.0f,
+          "neg_pos_ratio must greater than zero in max_negative mode");
+      PADDLE_ENFORCE_GT(
+          neg_dist_threshold, 0.0f,
+          "neg_dist_threshold must greater than zero in max_negative mode");
+    } else if (mining_type == MiningType::kHardExample) {
+      auto sample_size = ctx->Attrs().Get<int>("sample_size");
+      PADDLE_ENFORCE_GT(
+          sample_size, 0,
+          "sample_size must greater than zero in hard_example mode");
+    }
+    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+  }
+
+  // The kernel data type follows the element type of ClsLoss.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
+        ctx.device_context());
+  }
+};
+// Declares the inputs, outputs, attributes and user documentation of the
+// mine_hard_examples operator.
+class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "ClsLoss",
+        "(Tensor, default Tensor<float>), The classification loss with shape "
+        "[N, Np], N is the batch size and Np is the number of prior box.");
+    AddInput("LocLoss",
+             "(Tensor, optional, default Tensor<float>), The localization loss "
+             "with shape [N, Np], N is the batch size and Np is the number of "
+             "prior box.")
+        .AsDispensable();
+    AddInput("MatchIndices",
+             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
+             "the batch size and Np is the number of prior box. "
+             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
+             "instance does not match any entity, otherwise means it is "
+             "matched to row.");
+    // MatchDist holds distances, not indices.
+    AddInput("MatchDist",
+             "(Tensor, default Tensor<float>) Matched distance with shape [N, "
+             "Np], N is the batch size and Np is the number of prior box.");
+    AddAttr<float>("neg_pos_ratio",
+                   "(float) The ratio of the negative box to the positive "
+                   "box. Use only when mining_type is max_negative.")
+        .SetDefault(1.0);
+    AddAttr<float>("neg_dist_threshold",
+                   "(float) The negative overlap upper bound for the unmatched "
+                   "predictions. Use only when mining_type is max_negative.")
+        .SetDefault(0.5);
+    // The documented type labels below match the AddAttr<> template argument.
+    AddAttr<int>("sample_size",
+                 "(int) The max sample size of negative box. Use only when "
+                 "mining_type is hard_example.")
+        .SetDefault(0);
+    AddAttr<std::string>("mining_type",
+                         "(string) The mining algorithm name, the value is "
+                         "hard_example or max_negative.")
+        .SetDefault("max_negative")
+        .InEnum({"hard_example", "max_negative"});
+    AddOutput(
+        "NegIndices",
+        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
+        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
+        "and each element is the prior box index. "
+        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
+        "and sample 1's box 0 is selected. The output NegIndices is "
+        "[[1], [0]].");
+    AddOutput("UpdatedMatchIndices",
+              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
+              "shape [N, Np]. Only update when mining_type is "
+              "hard_example. The input MatchIndices elements will be update to "
+              "-1 when it is not in the candidate high loss list of negative "
+              "examples.");
+    AddComment(R"DOC(
+Mine hard examples Operator.
+This operator implements hard example mining to select a subset of negative box indices.
+For each image, it selects the boxes with the highest losses, subject to the condition
+that a box cannot have a MatchDist > neg_dist_threshold when mining_type is max_negative.
+The selected number is min(sample_size, max_negative_box_number) when mining_type is
+hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number)
+when mining_type is max_negative, where the max_negative_box_number is the count of
+MatchIndices elements with value -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // shorthand for the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
+                             ops::MineHardExamplesOpMaker);  // op + proto maker, no gradient op
+REGISTER_OP_CPU_KERNEL(
+    mine_hard_examples,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);  // CPU kernels for float and double
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+constexpr int64_t kOutputDim = 6;  // values per output row: label, score, xmin, ymin, xmax, ymax
+constexpr int64_t kBBoxSize = 4;  // row stride used when indexing the BBoxes buffer
+// Operator definition of multiclass_nms: validates the input shapes and
+// selects the kernel data type from the Scores input.
+class MultiClassNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires BBoxes to be [M, kBBoxSize] and Scores to be rank 3 with its
+  // last dimension equal to M. The first dimension set on Out is only a
+  // placeholder; the kernel resizes Out to the number of kept detections.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
+                   "Input(BBoxes) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scores"),
+                   "Input(Scores) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MultiClassNMS should not be null.");
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
+                      "The rank of Input(BBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
+                      "The rank of Input(Scores) must be 3.");
+    // Use the shared constants so that shape inference cannot drift from the
+    // row layout the kernel reads and writes.
+    PADDLE_ENFORCE_EQ(box_dims[1], kBBoxSize,
+                      "The 2nd dimension of Input(BBoxes) must be 4, "
+                      "represents the layout of coordinate "
+                      "[xmin, ymin, xmax, ymax]");
+    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
+                      "The 1st dimension of Input(BBoxes) must be equal to "
+                      "3rd dimension of Input(Scores), which represents the "
+                      "predicted bboxes.");
+    // Here the box_dims[0] is not the real dimension of output.
+    // It will be rewritten in the computing kernel.
+    ctx->SetOutputDim("Out", {box_dims[0], kOutputDim});
+  }
+
+ protected:
+  // The kernel data type follows the element type of Scores.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("Scores")->type()),
+        ctx.device_context());
+  }
+};
+// Ordering predicate for (score, payload) pairs: true when the left-hand score
+// is strictly greater, i.e. sorting with it yields descending scores.
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& lhs,
+                          const std::pair<float, T>& rhs) {
+  return rhs.first < lhs.first;
+}
+// Appends a (score, index) pair to *sorted_indices for every score strictly
+// above `threshold`, orders the vector by descending score with a stable sort
+// (so equal scores keep their insertion order) and, when top_k > -1, truncates
+// it to at most top_k entries.
+template <class T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(
+          std::make_pair(scores[i], static_cast<int>(i)));
+    }
+  }
+  // Sort the score pairs in descending order. The comparison is done in the
+  // native score type T: comparing through std::pair<float, int> would narrow
+  // double scores and treat close values as ties.
+  std::stable_sort(
+      sorted_indices->begin(), sorted_indices->end(),
+      [](const std::pair<T, int>& lhs, const std::pair<T, int>& rhs) {
+        return lhs.first > rhs.first;
+      });
+  // Keep top_k scores if needed.
+  if (top_k > -1 && static_cast<size_t>(top_k) < sorted_indices->size()) {
+    sorted_indices->resize(top_k);
+  }
+}
+// Area of a box laid out as [xmin, ymin, xmax, ymax].
+// Returns 0 for an invalid box (xmax < xmin or ymax < ymin). For normalized
+// coordinates the area is w * h; otherwise one is added to both the width and
+// the height before multiplying.
+template <class T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+  const bool invalid = box[2] < box[0] || box[3] < box[1];
+  if (invalid) {
+    return static_cast<T>(0.);
+  }
+  const T width = box[2] - box[0];
+  const T height = box[3] - box[1];
+  if (!normalized) {
+    // Coordinate values are not within range [0, 1].
+    return (width + 1) * (height + 1);
+  }
+  return width * height;
+}
+// Intersection-over-union of two boxes laid out as [xmin, ymin, xmax, ymax].
+// Returns 0 when the boxes do not intersect. `normalized` selects the area
+// convention and is applied to the intersection exactly as BBoxArea applies
+// it to the individual boxes, so identical boxes overlap by 1 in both modes.
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+                               const bool normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return static_cast<T>(0.);
+  } else {
+    const T inter_xmin = std::max(box1[0], box2[0]);
+    const T inter_ymin = std::max(box1[1], box2[1]);
+    const T inter_xmax = std::min(box1[2], box2[2]);
+    const T inter_ymax = std::min(box1[3], box2[3]);
+    // Unnormalized boxes get +1 on each side length, matching BBoxArea.
+    const T norm = normalized ? static_cast<T>(0.) : static_cast<T>(1.);
+    const T inter_w = inter_xmax - inter_xmin + norm;
+    const T inter_h = inter_ymax - inter_ymin + norm;
+    const T inter_area = inter_w * inter_h;
+    const T bbox1_area = BBoxArea<T>(box1, normalized);
+    const T bbox2_area = BBoxArea<T>(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+// CPU kernel of the multiclass_nms operator.
+//
+// For every sample of the batch and every class except the background label,
+// greedy NMS is run over the shared BBoxes; optionally only the keep_top_k
+// best detections per sample are retained. The kept detections are written to
+// Out as rows of [label, score, xmin, ymin, xmax, ymax], with a level-0 LoD
+// that marks the row range of each sample.
+template <typename T>
+class MultiClassNMSKernel : public framework::OpKernel<T> {
+ public:
+  // Greedy NMS for one class.
+  //   bbox             - boxes, dims()[0] boxes of dims()[1] values each.
+  //   scores           - scores of this class; the first dims()[0]-of-bbox
+  //                      values are read.
+  //   score_threshold  - scores not above it are ignored.
+  //   nms_threshold    - initial overlap threshold.
+  //   eta              - when < 1, the threshold is multiplied by eta after
+  //                      each kept box while it is above 0.5.
+  //   top_k            - candidate limit forwarded to GetMaxScoreIndex.
+  //   selected_indices - cleared, then filled with the kept box indices in
+  //                      descending score order.
+  void NMSFast(const Tensor& bbox, const Tensor& scores,
+               const T score_threshold, const T nms_threshold, const T eta,
+               const int64_t top_k, std::vector<int>* selected_indices) const {
+    // The total boxes for each instance.
+    int64_t num_boxes = bbox.dims()[0];
+    // 4: [xmin ymin xmax ymax]
+    int64_t box_size = bbox.dims()[1];
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    const T* bbox_data = bbox.data<T>();
+    // Visit the candidates in score order. Iterating replaces the former
+    // erase-from-front loop, which shifted the whole vector on every step.
+    for (const auto& candidate : sorted_indices) {
+      const int idx = candidate.second;
+      bool keep = true;
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
+        const int kept_idx = (*selected_indices)[k];
+        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                      bbox_data + kept_idx * box_size, true);
+        keep = overlap <= adaptive_threshold;
+        // One overlapping kept box is enough to suppress the candidate.
+        if (!keep) break;
+      }
+      if (keep) {
+        selected_indices->push_back(idx);
+        if (eta < 1 && adaptive_threshold > 0.5) {
+          adaptive_threshold *= eta;
+        }
+      }
+    }
+  }
+
+  // Runs NMSFast for every non-background class of one sample.
+  //   scores        - [class_num, predict_dim] scores of the sample.
+  //   bboxes        - boxes shared by all classes.
+  //   indices       - out: class label -> kept box indices.
+  //   num_nmsed_out - out: number of kept detections (after keep_top_k).
+  void MultiClassNMS(const framework::ExecutionContext& ctx,
+                     const Tensor& scores, const Tensor& bboxes,
+                     std::map<int, std::vector<int>>& indices,
+                     int& num_nmsed_out) const {
+    int64_t background_label = ctx.Attr<int>("background_label");
+    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
+    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
+    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
+    int64_t class_num = scores.dims()[0];
+    int64_t predict_dim = scores.dims()[1];
+    int num_det = 0;
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      Tensor score = scores.Slice(c, c + 1);
+      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
+              &(indices[c]));
+      num_det += indices[c].size();
+    }
+    num_nmsed_out = num_det;
+    const T* scores_data = scores.data<T>();
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+      // (score, (label, box index)); scores stay in type T so the double
+      // kernel does not rank on float-rounded values.
+      using ScoredDet = std::pair<T, std::pair<int, int>>;
+      std::vector<ScoredDet> score_index_pairs;
+      for (const auto& it : indices) {
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& label_indices = it.second;
+        for (size_t j = 0; j < label_indices.size(); ++j) {
+          int idx = label_indices[j];
+          PADDLE_ENFORCE_LT(idx, predict_dim);
+          score_index_pairs.push_back(
+              std::make_pair(sdata[idx], std::make_pair(label, idx)));
+        }
+      }
+      // Keep top k results per image.
+      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                       [](const ScoredDet& lhs, const ScoredDet& rhs) {
+                         return lhs.first > rhs.first;
+                       });
+      score_index_pairs.resize(keep_top_k);
+      // Store the new indices.
+      std::map<int, std::vector<int>> new_indices;
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+        int label = score_index_pairs[j].second.first;
+        int idx = score_index_pairs[j].second.second;
+        new_indices[label].push_back(idx);
+      }
+      new_indices.swap(indices);
+      num_nmsed_out = keep_top_k;
+    }
+  }
+
+  // Writes one row of kOutputDim values per selected detection into `outs`,
+  // iterating the labels in ascending order:
+  // [label, score, xmin, ymin, xmax, ymax].
+  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
+                        std::map<int, std::vector<int>>& selected_indices,
+                        Tensor* outs) const {
+    int predict_dim = scores.dims()[1];
+    auto* scores_data = scores.data<T>();
+    auto* bboxes_data = bboxes.data<T>();
+    auto* odata = outs->data<T>();
+    int count = 0;
+    for (const auto& it : selected_indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      const std::vector<int>& indices = it.second;
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int idx = indices[j];
+        const T* bdata = bboxes_data + idx * kBBoxSize;
+        odata[count * kOutputDim] = label;           // label
+        odata[count * kOutputDim + 1] = sdata[idx];  // score
+        // xmin, ymin, xmax, ymax
+        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+        count++;
+      }
+    }
+  }
+
+  // Entry point: runs NMS per sample, then materializes Out and its LoD.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes = ctx.Input<Tensor>("BBoxes");
+    auto* scores = ctx.Input<Tensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+    auto score_dims = scores->dims();
+    int64_t batch_size = score_dims[0];
+    int64_t class_num = score_dims[1];
+    int64_t predict_dim = score_dims[2];
+    std::vector<std::map<int, std::vector<int>>> all_indices;
+    // Level-0 LoD offsets of Out; entry i + 1 closes sample i.
+    std::vector<size_t> batch_starts = {0};
+    for (int64_t i = 0; i < batch_size; ++i) {
+      Tensor ins_score = scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+      std::map<int, std::vector<int>> indices;
+      int num_nmsed_out = 0;
+      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
+      all_indices.push_back(indices);
+      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    }
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+      // Nothing detected in the whole batch: Out holds the single value -1.
+      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
+      od[0] = -1;
+    } else {
+      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
+      for (int64_t i = 0; i < batch_size; ++i) {
+        Tensor ins_score = scores->Slice(i, i + 1);
+        ins_score.Resize({class_num, predict_dim});
+        int64_t s = batch_starts[i];
+        int64_t e = batch_starts[i + 1];
+        if (e > s) {
+          Tensor out = outs->Slice(s, e);
+          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
+        }
+      }
+    }
+    framework::LoD lod;
+    lod.emplace_back(batch_starts);
+    outs->set_lod(lod);
+  }
+};
+// Declares the inputs, outputs, attributes and user documentation of the
+// multiclass_nms operator.
+class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("BBoxes",
+             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
+             "predicted locations of M bounding bboxes. Each bounding box "
+             "has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax].");
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 1st dimension of BBoxes. ");
+    // The documented type labels match the AddAttr<> template arguments.
+    AddAttr<int>(
+        "background_label",
+        "(int, default: 0) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(0);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score. If not provided, consider all boxes.");
+    AddAttr<int>("nms_top_k",
+                 "(int) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences after filtering the detections based on "
+                 "score_threshold");
+    AddAttr<float>("nms_threshold",
+                   "(float, default: 0.3) "
+                   "The threshold to be used in NMS.")
+        .SetDefault(0.3);
+    AddAttr<float>("nms_eta",
+                   "(float) "
+                   "The parameter for adaptive NMS.")
+        .SetDefault(1.0);
+    AddAttr<int>("keep_top_k",
+                 "(int) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
+              "number of detections in this mini-batch. For each instance, "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddComment(R"DOC(
+This operator is to do multi-class non maximum suppression (NMS) on a batch
+of boxes and scores.
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes that have high scores larger than score_threshold, if providing this
+threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+is larger than -1. Then this operator prunes away boxes that have high IOU
+(intersection over union) overlap with already selected boxes by adaptive
+threshold NMS based on parameters of nms_threshold and nms_eta.
+After the NMS step, at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
+This operator supports multi-class and batched inputs. It applies NMS
+independently for each class. The output is a 2-D LoDTensor, for each
+image, the offsets in first dimension of LoDTensor are called LoD, the number
+of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
+means there is no detected bbox for this image. If there is no detected boxes
+for all images, all the elements in LoD are 0, and the Out only contains one
+value which is -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // shorthand for the registrations below
+REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
+                  ops::MultiClassNMSOpMaker,
+                  paddle::framework::EmptyGradOpMaker);  // registered with the empty grad-op maker
+REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);  // CPU kernels for float and double
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -12,187 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <stdint.h>
-#include <sys/stat.h>
 #include <ostream>
-#include <thread>
-#include <unistd.h>
+#include "paddle/framework/data_type.h"
-#include "paddle/framework/executor.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/proto_desc.h"
-#include "paddle/operators/detail/grpc_server.h"
+#include <future>
-#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/grpc_client.h"
-#include "paddle/operators/detail/simple_block_queue.h"
-#include "paddle/string/printf.h"
 namespace paddle {
 namespace operators {
-constexpr char kOptimizeBlock[] = "OptimizeBlock";
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
-  service->RunSyncUpdate();
-  VLOG(4) << "RunServer thread end";
-}
-static void CreateTensorFromMessageType(framework::Variable *var,
-                                        sendrecv::VarType var_type) {
-  if (var_type == sendrecv::VarType::LOD_TENSOR) {
-    var->GetMutable<framework::LoDTensor>();
-  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
-    var->GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW(
-        "VariableMessage type %d is not in "
-        "[LoDTensor, SelectedRows]",
-        var_type);
-  }
-}
 class RecvOp : public framework::OperatorBase {
 public:
-  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+  RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap &outputs,
+         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap &attrs)
+         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
+      : OperatorBase(type, inputs, outputs, attrs) {}
-    if (!rpc_service_) {
-      std::string endpoint = Attr<std::string>("endpoint");
+  void Run(const framework::Scope& scope,
-      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+           const platform::Place& place) const override {
-      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+    auto outs = Outputs("Out");
-    }
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-  }
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  void Stop() override {
+    auto& ctx = *pool.Get(place);
-    detail::MessageWithName term_msg;
-    term_msg.first = LISTEN_TERMINATE_MESSAGE;
+    for (size_t i = 0; i < outs.size(); i++) {
-    rpc_service_->Push(term_msg);
+      VLOG(3) << "getting " << outs[i];
-    rpc_service_->ShutDown();
+      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
-    server_thread_->join();
-  }
-  std::string GetGradVarNameForTrainer(const std::string &varname) const {
-    if (grads_counter_.find(varname) == grads_counter_.end()) {
-      grads_counter_[varname] = 0;
    }
-    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
+    PADDLE_ENFORCE(client_.Wait());
  }
-  void Run(const framework::Scope &scope,
+ private:
-           const platform::Place &dev_place) const override {
+  mutable detail::RPCClient client_;
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    framework::Scope &recv_scope = scope.NewScope();
-    // FIXME(Yancey1989): initialize rpc server with laze mode.
-    rpc_service_->SetScope(&recv_scope);
-    rpc_service_->SetDevCtx(&dev_ctx);
-    auto param_list = Attr<std::vector<std::string>>("ParamList");
-    auto grad_list = Attr<std::vector<std::string>>("GradList");
-    auto fan_in = Attr<int>("Fanin");
-    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-    auto *program = block->Program();
-    framework::Executor executor(dev_place);
-    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
-    bool exit_flag = false;
-    while (!exit_flag) {
-      // Get from multiple trainers, we don't care about the order in which
-      // the gradients arrives, just add suffix 0~n and merge the gradient.
-      rpc_service_->SetCond(0);
-      size_t recv_var_cnt = 0;
-      int batch_barrier = 0;
-      while (batch_barrier != fan_in) {
-        const detail::MessageWithName &v = rpc_service_->Get();
-        auto grad_var_name = v.first;
-        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
-          LOG(INFO) << "received terminate message and exit";
-          exit_flag = true;
-          break;
-        } else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
-          VLOG(3) << "recv batch barrier message";
-          batch_barrier++;
-          continue;
-        } else {
-          // receive a variable
-          recv_var_cnt++;
-          auto it =
-              std::find(grad_list.begin(), grad_list.end(), grad_var_name);
-          std::string param_var_name;
-          if (it != grad_list.end()) {
-            param_var_name = param_list[it - grad_list.begin()];
-          } else {
-            LOG(ERROR) << "grad has no paired param:" << grad_var_name;
-          }
-          VLOG(3) << "received grad: " << grad_var_name
-                  << " updating param: " << param_var_name;
-          if (fan_in > 1) {
-            grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
-          }
-          auto *var = recv_scope.FindVar(grad_var_name);
-          if (var == nullptr) {
-            LOG(ERROR) << "Can not find server side var: " << grad_var_name;
-            PADDLE_THROW("Can not find server side var");
-          }
-          detail::DeserializeFromMessage(v.second, dev_ctx, var);
-        }
-      }
-      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
-      // TODO(Yancey1989): merge SelectedRows variables here
-      if (exit_flag) {
-        break;
-      }
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
-                     false /*create_local_scope*/, false /*create_vars*/);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
-      }
-      rpc_service_->SetCond(1);
-      rpc_service_->WaitClientGet(recv_var_cnt);
-      grads_counter_.clear();
-    }  // while(true)
-  }
- protected:
-  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
-  std::shared_ptr<std::thread> server_thread_;
-  mutable std::unordered_map<std::string, int> grads_counter_;
 };
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
    AddComment(R"DOC(
 Recv operator
-This operator will recieve tensor from send_op
+This operator can get variables from server side.
 )DOC");
-    AddAttr<std::string>("endpoint",
+    AddAttr<std::vector<std::string>>("epmap",
-                         "(string, default 127.0.0.1:6164)"
+                                      "(string vector, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
+                                      "Server endpoints in the order of input "
-        .SetDefault("127.0.0.1:6164")
+                                      "variables for mapping")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<framework::BlockDesc *>(
-        kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
-    AddAttr<std::vector<std::string>>(
-        "ParamList", "type list of string",
-        "grad->param name mapping to find which parameters to optimize.")
-        .SetDefault({});
-    AddAttr<std::vector<std::string>>(
-        "GradList", "type list of string",
-        "grad->param name mapping to find which parameters to optimize.")
        .SetDefault({});
-    AddAttr<int>("Fanin", "type int",
-                 "Number of trainers in the current cluster job")
-        .SetDefault(1);
  }
 };

--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
    int input_dim = X->dims()[1];
    int num_sequence = batch_indices.size() - 1;
    int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
    auto stream = context.cuda_device_context().stream();
    if (future_context <= 32) {
@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
    int input_dim = X->dims()[1];
    int num_sequence = batch_indices.size() - 1;
    int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
    auto &device_ctx = context.cuda_device_context();
    math::SetConstant<platform::CUDADeviceContext, T> zero;

--- a/paddle/operators/save_combine_op.cc
+++ b/paddle/operators/save_combine_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+namespace paddle {
+namespace operators {
+// TODO(sidgoyal78): These functions are needed by other files (save_op); move
+// them to the paddle::filesystem namespace (as noted by yuyang18 in save_op).
+constexpr char kSEP = '/';
+// Returns true when stat() succeeds on `filepath`, i.e. something is
+// reachable at that path; any stat() failure is reported as "does not exist".
+static bool FileExists(const std::string &filepath) {
+  struct stat st;
+  const int rc = stat(filepath.c_str(), &st);
+  return rc == 0;
+}
+// Returns everything before the last kSEP in `filepath`, or an empty string
+// when `filepath` contains no separator at all.
+static std::string DirName(const std::string &filepath) {
+  const auto sep_pos = filepath.rfind(kSEP);
+  return sep_pos == std::string::npos ? std::string()
+                                      : filepath.substr(0, sep_pos);
+}
+// Creates the single directory `path` with mode 0755. A failing mkdir() is
+// tolerated only when errno is EEXIST; any other errno trips the enforce.
+static void MkDir(const char *path) {
+  const int rc = mkdir(path, 0755);
+  if (rc == 0) return;
+  PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+}
+// Creates `fullpath` together with every missing ancestor, parents first.
+// Does nothing for an empty path or for a path that already exists.
+static void MkDirRecursively(const char *fullpath) {
+  const bool is_empty = (*fullpath == '\0');
+  if (is_empty || FileExists(fullpath)) return;
+  const std::string parent = DirName(fullpath);
+  MkDirRecursively(parent.c_str());
+  MkDir(fullpath);
+}
+// Operator that serializes every LoDTensor named in input "X", in input
+// order, into the single file given by the "file_path" attribute.
+class SaveCombineOp : public framework::OperatorBase {
+ public:
+  SaveCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  // Writes all "X" variables found in `scope` to "file_path".
+  // Fails when the file exists and "overwrite" is false, when no inputs are
+  // given, when an input is missing or is not a LoDTensor, or when the file
+  // cannot be opened or written.
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+    if (FileExists(filename) && !overwrite) {
+      // The format string has a single %s, so only the filename is passed.
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename);
+    }
+    // Validate the input list before touching the file system so that an
+    // invalid op does not truncate an existing file.
+    auto inp_var_names = Inputs("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+    MkDirRecursively(DirName(filename).c_str());
+    // Serialized tensors are raw bytes; binary mode prevents any newline
+    // translation by the stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+      auto &tensor = var->Get<framework::LoDTensor>();
+      // Serialize tensor
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+    // close() flushes; a failed write or flush leaves the stream in a failed
+    // state, which would otherwise go unnoticed and leave a truncated file.
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Failed to write %s", filename);
+  }
+};
+// Declares the interface of the save_combine operator: one duplicable input
+// "X" plus the attributes "overwrite" and "file_path".
+class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // "X" is duplicable: it names the list of variables to be saved.
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SaveCombine operator
+This operator will serialize and write a list of input LoDTensor variables 
+to a file on disk.
+)DOC");
+    // Defaults to true, i.e. an existing file is replaced unless the caller
+    // opts out.
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if it exists.")
+        .SetDefault(true);
+    // No default is set for "file_path".
+    // NOTE(review): the checker lambda returns a bool; confirm that
+    // AddCustomChecker acts on the return value, otherwise an empty path is
+    // not actually rejected here.
+    AddAttr<std::string>(
+        "file_path",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the save_combine operator with its implementation class and its
+// proto maker; no further arguments are passed to the macro.
+REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
+                  ops::SaveCombineOpProtoMaker);
--- a/paddle/operators/save_load_combine_op_test.cc
+++ b/paddle/operators/save_load_combine_op_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <iostream>
+#include <string>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+USE_NO_KERNEL_OP(save_combine);
+USE_NO_KERNEL_OP(load_combine);
+// Obtains variable `var_name` from `scope`, shapes its LoDTensor as {x, y},
+// gives it a single-level LoD copied from `lod_info` (also written to
+// `expect_lod`), fills it with 0, 1, 2, ... and returns the int buffer.
+int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                            std::string var_name,
+                            paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope& scope,
+                            paddle::framework::LoD& expect_lod) {
+  auto* lod_tensor =
+      scope.Var(var_name)->GetMutable<paddle::framework::LoDTensor>();
+  lod_tensor->Resize({x, y});
+  expect_lod.resize(1);
+  for (int offset : lod_info) {
+    expect_lod[0].push_back(offset);
+  }
+  lod_tensor->set_lod(expect_lod);
+  int* values = lod_tensor->mutable_data<int>(place);
+  for (int64_t idx = 0; idx < lod_tensor->numel(); ++idx) {
+    values[idx] = static_cast<int>(idx);
+  }
+  return values;
+}
+// Obtains variable `out_var_name` from `scope` via Var() and returns its
+// mutable LoDTensor, to be used as the destination of a later load.
+paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
+    const std::string out_var_name, paddle::framework::Scope& scope) {
+  auto* placeholder_var = scope.Var(out_var_name);
+  return placeholder_var->GetMutable<paddle::framework::LoDTensor>();
+}
+// Reads back a loaded tensor: returns its int data pointer and copies its
+// LoD into `actual_lod`. `scope` is accepted but not used.
+int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                                 paddle::framework::Scope& scope,
+                                 paddle::framework::LoD& actual_lod) {
+  int* loaded_values = target->data<int>();
+  actual_lod = target->lod();
+  return loaded_values;
+}
+// Compares the first `numel` ints of `expect` and `actual`, then compares
+// the two LoDs level by level.
+// LoD size mismatches are fatal (ASSERT_*) so that the element comparison
+// below never indexes past the end of `actual_lod`.
+void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
+                 paddle::framework::LoD actual_lod, const int& numel) {
+  for (int64_t i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  ASSERT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    // Also catches a loaded level that is longer than the expected one.
+    ASSERT_EQ(expect_lod[i].size(), actual_lod[i].size());
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+// Here, we create 4 LoDTensors and use save_combine_op to first save these
+// in a single file. Then, we use load_combine_op to load these sequentially
+// into four fresh output variables and compare values and LoD one by one.
+TEST(SaveLoadCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  // Each numelN below is the product of the two shape arguments passed to
+  // the matching CreateForSaveCombineOp call; keep them in sync.
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
+                                        expect_lod1);
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
+                                        expect_lod2);
+  std::vector<int> lod3 = {0, 2, 3, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
+                                        scope, expect_lod3);
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
+                                        expect_lod4);
+  // Set attributes
+  // The same attribute map (only "file_path") is reused for save and load,
+  // so both ops operate on the same relative file name.
+  std::string filename = "check_tensor.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+  // Run the load_combine_op
+  // Outputs are listed in the same order as the inputs that were saved.
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+  // Compare each loaded tensor with the buffer and LoD it was created from.
+  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+// Same scenario as the original SaveLoadTest, but driven through
+// save_combine / load_combine with a single variable: save one {3, 10} int
+// LoDTensor, load it into another variable, and compare values and LoD.
+TEST(SaveLoadTestWithCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  // Save and load share one attribute map, hence the same file path.
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("check_t.save")});
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  int* actual = target->data<int>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  auto& actual_lod = target->lod();
+  // Size mismatches are fatal: continuing would index actual_lod out of
+  // bounds in the element-wise comparison below.
+  ASSERT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    ASSERT_EQ(expect_lod[i].size(), actual_lod[i].size());
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
  auto var = scope.Var("test_var");
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({10, 10});
+  tensor->Resize({3, 10});
  paddle::framework::LoD expect_lod;
  expect_lod.resize(1);
  expect_lod[0].push_back(0);

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -42,28 +42,34 @@ class SendOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
    for (size_t i = 0; i < ins.size(); i++) {
      VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-      client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
    }
-    PADDLE_ENFORCE(client_.Wait());
+    PADDLE_ENFORCE(rpc_client->Wait());
    for (auto& ep : endpoints) {
      VLOG(3) << "batch barrier, ep: " << ep;
-      client_.AsyncSendBatchBarrier(ep);
+      rpc_client->AsyncSendBatchBarrier(ep);
    }
-    PADDLE_ENFORCE(client_.Wait());
+    PADDLE_ENFORCE(rpc_client->Wait());
-    for (size_t i = 0; i < outs.size(); i++) {
+    if (outs.size() > 0) {
-      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      for (size_t i = 0; i < outs.size(); i++) {
-      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+        VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      }
+      PADDLE_ENFORCE(rpc_client->Wait());
    }
-    PADDLE_ENFORCE(client_.Wait());
  }
- private:
-  mutable detail::RPCClient client_;
 };
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -73,11 +79,16 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
    AddOutput("Out", "(Tensor) Output tensor to be received from server")
        .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
    AddComment(R"DOC(
 Send operator
 This operator will send tensor to recv_op at the parameter server.
 )DOC");
+    // TODO(typhoonzero): remove this attr and generate the de-duplicated
+    // vector from epmap when initializing.
    AddAttr<std::vector<std::string>>("endpoints",
                                      "(string vector, default 127.0.0.1:6164)"
                                      "Server endpoints to send variables to.")

--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 USE_NO_KERNEL_OP(send);
-USE_NO_KERNEL_OP(recv);
+USE_NO_KERNEL_OP(listen_and_serv);
 USE_OP(sum);
 namespace f = paddle::framework;
@@ -33,7 +33,7 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 // global for simplicity.
-std::unique_ptr<f::OperatorBase> recv_op;
+std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
  p::CPUDeviceContext ctx(place);
@@ -120,7 +120,7 @@ void StartServerNet(bool is_sparse) {
    InitTensorsInScope(scope, place);
  }
-  // sub program run in recv_op, for simple test we use sum
+  // sub program run in listen_and_serv_op, for simple test we use sum
  f::ProgramDesc program;
  f::BlockDesc *block = program.MutableBlock(0);
  // X for server side tensors, RX for received tensers, must be of same shape.
@@ -131,8 +131,9 @@ void StartServerNet(bool is_sparse) {
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
  attrs.insert({"OptimizeBlock", block});
-  recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
+  listen_and_serv_op =
-  recv_op->Run(scope, place);
+      f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
+  listen_and_serv_op->Run(scope, place);
 }
 TEST(SendRecvOp, CPUDense) {
@@ -161,9 +162,9 @@ TEST(SendRecvOp, CPUDense) {
  for (int64_t i = 0; i < target->numel(); ++i) {
    EXPECT_EQ(expected[i] * 2, actual[i]);
  }
-  recv_op->Stop();
+  listen_and_serv_op->Stop();
  server_thread.join();
-  recv_op.reset(nullptr);
+  listen_and_serv_op.reset(nullptr);
 }
 TEST(SendRecvOp, CPUSparse) {
@@ -200,7 +201,7 @@ TEST(SendRecvOp, CPUSparse) {
    EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
              actual->mutable_data<float>(place)[i]);
  }
-  recv_op->Stop();
+  listen_and_serv_op->Stop();
  server_thread.join();
-  recv_op.reset();
+  listen_and_serv_op.reset();
 }
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
    GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
    // Set LoD for output
-    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
+    std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
    framework::LoD out_lod;
    out_lod.push_back(out_lod0);
    out->set_lod(out_lod);

--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
      auto& in_value = grad->value();
-      auto& in_rows = grad->rows();
+      framework::Vector<int64_t> in_rows(grad->rows());
      int64_t in_row_numel = in_value.numel() / in_rows.size();
      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      dim3 grid(1, in_rows.size());
      SparseSGDFunctorKernel<
          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.data(), learning_rate->data<T>(), out_data,
+          in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
          in_row_numel);
    } else {

--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
        }
      }
    } else if (out_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0
+        auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto &rows = in_sel0.rows();
+#ifdef PADDLE_WITH_CUDA
+        std::vector<int64_t> rows_in_cpu;
+        rows_in_cpu.reserve(rows.size());
+        for (auto item : rows) {
+          rows_in_cpu.push_back(item);
+        }
+        in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
+#else
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+#endif
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+      auto get_selected_row = [&](size_t i) -> const SelectedRows & {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
      auto *out = context.Output<SelectedRows>("Out");
      out->mutable_rows()->clear();
      auto *out_value = out->mutable_value();
@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
      // Runtime InferShape
      size_t first_dim = 0;
      for (int i = 0; i < N; i++) {
-        first_dim += in_vars[i]->Get<SelectedRows>().rows().size();
+        auto &sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
      }
-      auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims();
+      auto in_dim =
-      auto in_dim_vec = framework::vectorize(in_dim);
+          framework::vectorize(get_selected_row(N - 1).value().dims());
-      in_dim_vec[0] = static_cast<int64_t>(first_dim);
+      in_dim[0] = static_cast<int64_t>(first_dim);
-      out_value->Resize(framework::make_ddim(in_dim_vec));
+      out_value->Resize(framework::make_ddim(in_dim));
      out_value->mutable_data<T>(context.GetPlace());
      math::SelectedRowsAddTo<DeviceContext, T> functor;
      int64_t offset = 0;
      for (int i = 0; i < N; i++) {
-        PADDLE_ENFORCE_EQ(out->height(),
+        auto &sel_row = get_selected_row(i);
-                          in_vars[i]->Get<SelectedRows>().height());
-        functor(context.template device_context<DeviceContext>(),
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
-                in_vars[i]->Get<SelectedRows>(), offset, out);
+        functor(context.template device_context<DeviceContext>(), sel_row,
-        offset += in_vars[i]->Get<SelectedRows>().value().numel();
+                offset, out);
+        offset += sel_row.value().numel();
      }
    } else if (out_var->IsType<framework::LoDTensorArray>()) {
      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();

--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
    auto step_scopes =
        scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
+                   "Condition of while op must in CPU memory.");
    while (cond.data<bool>()[0]) {
      auto &current_scope = scope.NewScope();
      step_scopes->push_back(&current_scope);
@@ -99,6 +101,9 @@ class WhileGradOp : public framework::OperatorBase {
  void Run(const framework::Scope &scope,
           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
@@ -205,6 +210,8 @@ class WhileGradOp : public framework::OperatorBase {
        sum_op->Run(cur_scope, dev_place);
        cur_scope.Rename(new_inside_name, inside_grad_name);
      }
+      dev_ctx.Wait();
+      const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
    }
  }
 };

--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
      };
      break;
    default:
-      sorted_domain = "event end time";
+      sorted_domain = "event first end time";
  }
  std::vector<std::vector<EventItem>> events_table;

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) {
      .def(
          "__init__",
          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
+            LoD new_lod;
-            new (&instance) LoDTensor(lod);
+            new_lod.reserve(lod.size());
-#else
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             LoD new_lod;
+            new (&instance) LoDTensor(new_lod);
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             new (&instance) LoDTensor(new_lod);
-#endif
          })
      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
      .def("set_lod",
           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
-             self.set_lod(lod);
-#else
             LoD new_lod;
             new_lod.reserve(lod.size());
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
             self.set_lod(new_lod);
-#endif
           })
      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifndef PADDLE_WITH_CUDA
+        auto lod = self.lod();
-        return self.lod();
+        std::vector<std::vector<size_t>> new_lod;
-#else
+        new_lod.reserve(lod.size());
-           auto lod = self.lod();
+        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-           std::vector<std::vector<size_t>> new_lod;
+        return new_lod;
-           new_lod.reserve(lod.size());
-           std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
-               [](Vector<size_t> item) ->
-                   std::vector<size_t> {
-                 std::vector<size_t> v;
-                 v.reserve(item.size());
-                 std::copy(item.begin(), item.end(), std::back_inserter(v));
-                 return v;
-               });
-           return new_lod;
-#endif
      });
  py::class_<SelectedRows>(m, "SelectedRows")

--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | ------ | -------- | ----------- |
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
-| `WITH_TESTING` | ON | Build unit tests binaries. |
+| `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -32,7 +32,7 @@ function cmake_gen() {
    cat <<EOF
    ========================================
    Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release}
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
        ${PYTHON_FLAGS}
        -DWITH_DOC=OFF
        -DWITH_GPU=${WITH_GPU:-OFF}
@@ -40,6 +40,7 @@ function cmake_gen() {
        -DWITH_MKL=${WITH_MKL:-ON}
        -DWITH_AVX=${WITH_AVX:-OFF}
        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
        -DWITH_SWIG_PY=ON
        -DWITH_C_API=${WITH_C_API:-OFF}
        -DWITH_PYTHON=${WITH_PYTHON:-ON}
@@ -54,7 +55,7 @@ EOF
    # docker environment is fully controlled by this script.
    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
    cmake .. \
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} \
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
        ${PYTHON_FLAGS} \
        -DWITH_DOC=OFF \
        -DWITH_GPU=${WITH_GPU:-OFF} \
@@ -62,6 +63,7 @@ EOF
        -DWITH_MKL=${WITH_MKL:-ON} \
        -DWITH_AVX=${WITH_AVX:-OFF} \
        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
        -DWITH_C_API=${WITH_C_API:-OFF} \
        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
@@ -77,6 +79,7 @@ function run_build() {
    Building in /paddle/build ...
    ============================================
 EOF
+    make clean
    make -j `nproc`
 }

--- a/paddle/scripts/docker/test.sh
+++ b/paddle/scripts/docker/test.sh
+#!/bin/bash
+# Run the ctest suite as NUM_PROC parallel shards. Every shard gets the same
+# GPU memory fraction and a rotated CUDA_VISIBLE_DEVICES ordering.
+set -e
+# the number of processes used to run tests
+NUM_PROC=6
+# calculate and set the memory usage for each process
+MEM_USAGE=$(printf "%.2f" "$(echo "scale=5; 1.0 / $NUM_PROC" | bc)")
+export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
+# get the CUDA device count
+CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
+# PIDs of the background ctest shards, collected so each can be waited on
+pids=()
+for (( i = 0; i < NUM_PROC; i++ )); do
+    # Build a comma-separated device list that starts at (i % device count)
+    # and wraps around, so each shard lists a different device first.
+    cuda_list=""
+    for (( j = 0; j < CUDA_DEVICE_COUNT; j++ )); do
+        n=$(( (i + j) % CUDA_DEVICE_COUNT ))
+        if [ "$j" -eq 0 ]; then
+            cuda_list="$n"
+        else
+            cuda_list="$cuda_list,$n"
+        fi
+    done
+    echo "$cuda_list"
+    # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
+    # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
+    env CUDA_VISIBLE_DEVICES="$cuda_list" ctest -I "$i,,$NUM_PROC" --output-on-failure &
+    pids+=("$!")
+done
+# A bare `wait` always returns 0, which would hide failing shards. Wait on
+# every shard individually and report failure if any of them failed.
+status=0
+for pid in "${pids[@]}"; do
+    wait "$pid" || status=1
+done
+exit $status
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -22,12 +22,15 @@ limitations under the License. */
 int main(int argc, char** argv) {
  std::vector<char*> new_argv;
  std::string gflags_env;
-  new_argv.push_back(argv[0]);
+  for (int i = 0; i < argc; ++i) {
+    new_argv.push_back(argv[i]);
+  }
 #ifdef PADDLE_WITH_CUDA
  new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
+             "warpctc_dir"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
 #endif
  int new_argc = static_cast<int>(new_argv.size());
  char** new_argv_address = new_argv.data();

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -26,6 +26,7 @@ import initializer
 import layers
 import nets
 import optimizer
+import learning_rate_decay
 import backward
 import regularizer
 from param_attr import ParamAttr
@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler
 from distribute_transpiler_simple import SimpleDistributeTranspiler
 import clip
 from memory_optimization_transpiler import memory_optimize
+import profiler
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
-    'io',
+    'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
-    'initializer',
+    'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
-    'layers',
-    'nets',
-    'optimizer',
-    'backward',
-    'regularizer',
-    'LoDTensor',
-    'CPUPlace',
-    'CUDAPlace',
-    'Tensor',
-    'ParamAttr'
+    'ParamAttr',
-    'DataFeeder',
+    'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
-    'clip',
+    'memory_optimize', 'profiler'
-    'SimpleDistributeTranspiler',
-    'DistributeTranspiler',
-    'memory_optimize',
 ]
@@ -87,10 +77,10 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
    ]
    if core.is_compiled_with_cuda():
-        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
+        read_env_flags += ['fraction_of_gpu_memory_to_use']
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
    core.init_glog(sys.argv[0])

--- a/python/paddle/v2/fluid/debuger.py
+++ b/python/paddle/v2/fluid/debuger.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+from graphviz import GraphPreviewGenerator
+import proto.framework_pb2 as framework_pb2
+_vartype2str_ = [
+    "UNK",
+    "LoDTensor",
+    "SelectedRows",
+    "FeedMinibatch",
+    "FetchList",
+    "StepScopes",
+    "LodRankTable",
+    "LoDTensorArray",
+    "PlaceList",
+]
+_dtype2str_ = [
+    "bool",
+    "int16",
+    "int32",
+    "int64",
+    "float16",
+    "float32",
+    "float64",
+]
+def repr_data_type(type):
+    '''
+    Return the display name stored in ``_dtype2str_`` at index ``type``.
+    '''
+    dtype_name = _dtype2str_[type]
+    return dtype_name
+def repr_tensor(proto):
+    '''
+    Render a tensor description as ``tensor(type=<dtype name>, shape=<dims>)``.
+    Reads ``proto.data_type`` (used as an index into ``_dtype2str_``) and
+    ``proto.dims``.
+    '''
+    dtype_name = _dtype2str_[int(proto.data_type)]
+    dims_text = str(proto.dims)
+    return "tensor(type={}, shape={})".format(dtype_name, dims_text)
+reprtpl = "{ttype} {name} ({reprs})"
+def repr_lodtensor(proto):
+    '''
+    Render a variable description through its ``lod_tensor`` field.
+    Returns None when ``proto.lod_tensor`` is falsy. The variable is labelled
+    "LoDTensor" (with its level) when ``lod_level`` is positive, otherwise
+    "Tensor".
+    '''
+    lod_desc = proto.lod_tensor
+    if not lod_desc:
+        return
+    level = lod_desc.lod_level
+    tensor_text = repr_tensor(lod_desc.tensor)
+    if level > 0:
+        kind = "LoDTensor"
+        detail = "level=%d, %s" % (level, tensor_text)
+    else:
+        kind = "Tensor"
+        detail = tensor_text
+    return reprtpl.format(ttype=kind, name=proto.name, reprs=detail)
+def repr_selected_rows(proto):
+    '''
+    Render a variable description through its ``selected_rows`` field.
+    Returns None when ``proto.selected_rows`` is falsy.
+    '''
+    rows_desc = proto.selected_rows
+    if not rows_desc:
+        return
+    var_name = proto.name
+    tensor_text = repr_tensor(rows_desc)
+    return reprtpl.format(
+        ttype="SelectedRows", name=var_name, reprs=tensor_text)
+def repr_tensor_array(proto):
+    '''
+    Render a variable description through its ``tensor_array`` field.
+    Returns None when ``proto.tensor_array`` is falsy.
+    '''
+    if not proto.tensor_array: return
+    array_desc = proto.tensor_array
+    # repr_tensor reads `.data_type` and `.dims`, so it must be given the
+    # array's element tensor rather than the unrelated `lod_tensor` wrapper.
+    # NOTE(review): assumes the array description exposes that tensor as
+    # `.tensor`, mirroring `lod_tensor.tensor` -- confirm in framework.proto.
+    return reprtpl.format(
+        ttype="TensorArray",
+        name=proto.name,
+        reprs="level=%d, %s" % (array_desc.lod_level,
+                                repr_tensor(array_desc.tensor)))
+type_handlers = [
+    repr_lodtensor,
+    repr_selected_rows,
+    repr_tensor_array,
+]
+def repr_var(vardesc):
+    '''
+    Try each renderer in ``type_handlers`` in order and return the first
+    truthy result; return None when no renderer produces one.
+    '''
+    for render in type_handlers:
+        text = render(vardesc)
+        if not text:
+            continue
+        return text
+    return None
+def pprint_program_codes(program_desc):
+    '''
+    Render every block of ``program_desc`` with ``pprint_block_codes`` and
+    join the results with newlines.
+    '''
+    block_texts = [
+        pprint_block_codes(program_desc.block(index))
+        for index in range(program_desc.num_blocks())
+    ]
+    return '\n'.join(block_texts)
+def pprint_block_codes(block_desc, show_backward=False):
+    '''
+    Render one block as text: a header with the block and parent indices,
+    then its variables, then its operators.
+    Unless ``show_backward`` is true, variables whose name contains "@GRAD"
+    are left out, as are operators whose type ends in "_grad" or whose
+    inputs/outputs mention "@GRAD".
+    '''
+    def slot_mentions_grad(slot):
+        # a slot counts as gradient-related through its parameter name or
+        # through any of its argument names
+        if "@GRAD" in slot.parameter:
+            return True
+        for arg_name in slot.arguments:
+            if "@GRAD" in arg_name:
+                return True
+        return False
+    def op_is_backward(op_desc):
+        if op_desc.type.endswith('_grad'):
+            return True
+        for slot in op_desc.inputs:
+            if slot_mentions_grad(slot):
+                return True
+        for slot in op_desc.outputs:
+            if slot_mentions_grad(slot):
+                return True
+        return False
+    def var_is_backward(var_desc):
+        return "@GRAD" in var_desc.name
+    # anything that is not already a BlockDesc message is converted via its
+    # serialized form
+    if type(block_desc) is not framework_pb2.BlockDesc:
+        serialized = block_desc.serialize_to_string()
+        block_desc = framework_pb2.BlockDesc.FromString(serialized)
+    var_reprs = [
+        repr_var(var_desc) for var_desc in block_desc.vars
+        if show_backward or not var_is_backward(var_desc)
+    ]
+    op_reprs = [
+        repr_op(op_desc) for op_desc in block_desc.ops
+        if show_backward or not op_is_backward(op_desc)
+    ]
+    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
+    vars_text = '\n'.join(var_reprs)
+    ops_text = '\n'.join(op_reprs)
+    return tpl.format(
+        idx=block_desc.idx,
+        pidx=block_desc.parent_idx,
+        vars=vars_text,
+        ops=ops_text)
+def repr_attr(desc):
+    '''
+    Render one attribute as ``key=value``.
+    ``desc.type`` indexes the table below to choose which field of ``desc``
+    holds the value. A ``dtype`` attribute is shown by its data-type name.
+    Returns a tuple ``(text, (key, value))``.
+    '''
+    # field of `desc` holding the value, indexed by desc.type
+    value_fields = (
+        'i',
+        'f',
+        's',
+        'ints',
+        'floats',
+        'strings',
+        'b',
+        'bools',
+        'block_idx',
+        'l', )
+    key = desc.name
+    value = getattr(desc, value_fields[desc.type])
+    if key == "dtype":
+        value = repr_data_type(value)
+    text = "{key}={value}".format(key=key, value=str(value))
+    return text, (key, value)
+def _repr_op_fill_constant(optype, inputs, outputs, attrs):
+    '''
+    Special-case renderer for ``fill_constant`` operators.
+    Returns ``<outputs> = <value> [shape=<shape>]`` for that operator type
+    and None for every other type. ``inputs`` is not used.
+    '''
+    if optype != "fill_constant":
+        return None
+    output_text = ','.join(outputs)
+    fill_value = attrs['value']
+    shape_text = str(attrs['shape'])
+    return "{output} = {data} [shape={shape}]".format(
+        output=output_text, data=fill_value, shape=shape_text)
+op_repr_handlers = [_repr_op_fill_constant, ]
+def repr_op(opdesc):
+    '''
+    Render one operator as ``outputs = type(inputs[, is_target]) [{attrs}]``.
+    The renderers in ``op_repr_handlers`` are tried first; the first truthy
+    result replaces the generic form.
+    '''
+    def format_args(args):
+        # a single argument is shown bare, several as a list literal
+        if len(args) == 1:
+            return args[0]
+        return str(list(args))
+    inputs = [
+        "%s=%s" % (slot.parameter, format_args(slot.arguments))
+        for slot in opdesc.inputs
+    ]
+    outputs = [format_args(slot.arguments) for slot in opdesc.outputs]
+    attrs = []
+    attr_dict = {}
+    for attr in opdesc.attrs:
+        attr_text, (attr_key, attr_value) = repr_attr(attr)
+        attrs.append(attr_text)
+        attr_dict[attr_key] = attr_value
+    is_target = opdesc.is_target
+    for handler in op_repr_handlers:
+        special = handler(opdesc.type, inputs, outputs, attr_dict)
+        if special:
+            return special
+    target_suffix = ", is_target" if is_target else ""
+    return "{outputs} = {optype}({inputs}{is_target}) [{attrs}]".format(
+        outputs=', '.join(outputs),
+        optype=opdesc.type,
+        inputs=', '.join(inputs),
+        attrs="{%s}" % ','.join(attrs),
+        is_target=target_suffix)
+def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
+    '''
+    Generate a debug graph for block.
+    Every variable and operator of the block becomes a node; edges run from
+    input variables to the operator and from the operator to its outputs.
+    The graph is written to ``path`` and then shown.
+    Args:
+        block(Block): a block.
+        highlights(list|None): regular-expression strings; a variable or
+            operator whose name matches any of them (``re.match``) is
+            highlighted. None disables highlighting.
+        path(str): path of the dot file to write.
+    '''
+    graph = GraphPreviewGenerator("some graph")
+    # collect parameters and args
+    # the block is round-tripped through its serialized form to obtain a
+    # framework_pb2.BlockDesc message
+    protostr = block.desc.serialize_to_string()
+    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+    # True when `name` matches one of the highlight patterns
+    def need_highlight(name):
+        if highlights is None: return False
+        for pattern in highlights:
+            assert type(pattern) is str
+            if re.match(pattern, name):
+                return True
+        return False
+    # draw parameters and args
+    # maps variable name -> graph node
+    vars = {}
+    for var in desc.vars:
+        shape = [str(i) for i in var.lod_tensor.tensor.dims]
+        if not shape:
+            shape = ['null']
+        # create var
+        # persistable variables are drawn as parameters, all others as args
+        if var.persistable:
+            varn = graph.add_param(
+                var.name, var.type, shape, highlight=need_highlight(var.name))
+        else:
+            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
+        vars[var.name] = varn
+    # Connect operator node `op` with every argument of `var`; `op2var`
+    # selects the edge direction (True: op -> variable).
+    def add_op_link_var(op, var, op2var=False):
+        for arg in var.arguments:
+            if arg not in vars:
+                # add missing variables as argument
+                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
+            varn = vars[arg]
+            # an edge is highlighted when either of its endpoints matches
+            highlight = need_highlight(op.description) or need_highlight(
+                varn.description)
+            if op2var:
+                graph.add_edge(op, varn, highlight=highlight)
+            else:
+                graph.add_edge(varn, op, highlight=highlight)
+    for op in desc.ops:
+        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
+        for var in op.inputs:
+            add_op_link_var(opn, var, False)
+        for var in op.outputs:
+            add_op_link_var(opn, var, True)
+    # always rendered with show=True, i.e. the result is opened after writing
+    graph(path, show=True)
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -153,11 +153,18 @@ class DistributeTranspiler:
            self.param_grad_ep_mapping[ep]["params"].append(param)
            self.param_grad_ep_mapping[ep]["grads"].append(grad)
+        rpc_client_var = program.global_block().create_var(
+            name="RPC_CLIENT_VAR",
+            persistable=True,
+            dtype='float32',  # dtype and shape are not actually used
+            shape=[0])
        # create send_op
        send_op = program.global_block().append_op(
            type="send",
            inputs={"X": send_inputs},
-            outputs={"Out": send_outputs},
+            outputs={"Out": send_outputs,
+                     "RPCClient": rpc_client_var},
            attrs={"endpoints": pserver_endpoints,
                   "epmap": eplist})
        # step4
@@ -471,9 +478,9 @@ class DistributeTranspiler:
            else:
                self._append_pserver_non_opt_ops(optimize_sub_program,
                                                 pserver_program, opt_op)
-        # Append the recv op
+        # Append the listen_and_serv op
        pserver_program.global_block().append_op(
-            type="recv",
+            type="listen_and_serv",
            inputs={},
            outputs={},
            attrs={

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -451,9 +451,8 @@ class Operator(object):
            if not given == need:
                raise ValueError(("Incorrect setting for output(s) of "
                                  "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type, ", ".join(str(e)
+                                 (type, ", ".join(str(e) for e in need),
-                                                  for e in need), ", ".join(
+                                  ", ".join(str(e) for e in given)))
-                                                      str(e) for e in given)))
            for out_proto in proto.outputs:
                out_args = outputs[out_proto.name]
@@ -489,7 +488,8 @@ class Operator(object):
        no_kernel_op_set = {
            'feed', 'fetch', 'save', 'load', 'recurrent',
            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'parallel_do'
+            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
+            'load_combine'
        }
        if type not in no_kernel_op_set:
            self.desc.infer_var_type(self.block.desc)

--- a/python/paddle/v2/fluid/graphviz.py
+++ b/python/paddle/v2/fluid/graphviz.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import random
+import subprocess
+import logging
+def crepr(v):
+    '''
+    Render a value for DOT output: ``str`` and ``unicode`` values are
+    wrapped in double quotes, everything else goes through ``str()``.
+    '''
+    if type(v) is not str and type(v) is not unicode:
+        return str(v)
+    return '"%s"' % v
+class Rank(object):
+    '''
+    A named group of nodes rendered as a ``{rank=<kind>;...}`` DOT clause.
+    '''
+    def __init__(self, kind, name, priority):
+        '''
+        kind: str
+        name: str
+        priority: int
+        '''
+        # nodes assigned to this group; filled in after construction
+        self.nodes = []
+        self.priority = priority
+        self.name = name
+        self.kind = kind
+    def __str__(self):
+        # a group without nodes contributes nothing to the output
+        if len(self.nodes) == 0:
+            return ''
+        member_names = [member.name for member in self.nodes]
+        return '{{rank={};{}}}'.format(self.kind, ','.join(member_names))
+class Graph(object):
+    '''
+    In-memory model of a digraph (nodes, edges, graph-level attributes and
+    rank groups) that ``__str__`` renders as DOT source.
+    '''
+    # class-wide counter used to give every rank group a unique name
+    rank_counter = 0
+    def __init__(self, title, **attrs):
+        '''
+        title: str, written as the graph's ``title``
+        attrs: graph-level attributes written as ``key=value;`` lines
+        '''
+        self.title = title
+        self.attrs = attrs
+        self.nodes = []
+        self.edges = []
+        # rank-group name -> Rank
+        self.rank_groups = {}
+    def code(self):
+        '''Return the DOT source of the graph.'''
+        return self.__str__()
+    def rank_group(self, kind, priority):
+        '''
+        Create a new rank group and return its generated name, which can be
+        passed to ``node`` as ``rank=<name>``.
+        '''
+        name = "rankgroup-%d" % Graph.rank_counter
+        Graph.rank_counter += 1
+        rank = Rank(kind, name, priority)
+        self.rank_groups[name] = rank
+        return name
+    def node(self, label, prefix, description="", **attrs):
+        '''
+        Create a node, register it with the graph and return it. An optional
+        ``rank`` keyword names the rank group the node is added to.
+        '''
+        # `rank` selects a group and is not a node attribute. It has to be
+        # removed before the Node is built, because the Node keeps its own
+        # copy of the keyword arguments.
+        rank_name = attrs.pop('rank', None)
+        node = Node(label, prefix, description, **attrs)
+        if rank_name is not None:
+            self.rank_groups[rank_name].nodes.append(node)
+        self.nodes.append(node)
+        return node
+    def edge(self, source, target, **attrs):
+        '''Create an edge from source to target, register and return it.'''
+        edge = Edge(source, target, **attrs)
+        self.edges.append(edge)
+        return edge
+    def compile(self, dot_path):
+        '''
+        Write the DOT source to ``dot_path``, run ``dot`` to render a PDF and
+        return the PDF path.
+        '''
+        # close (and so flush) the file before `dot` reads it
+        with open(dot_path, 'w') as dot_file:
+            dot_file.write(self.__str__())
+        image_path = os.path.join(
+            os.path.dirname(__file__), dot_path[:-3] + "pdf")
+        cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
+        renderer = subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+        # wait for rendering to finish so the returned path is complete;
+        # communicate() also drains the pipes so `dot` cannot block on them
+        renderer.communicate()
+        logging.warning("write block debug graph to {}".format(image_path))
+        return image_path
+    def show(self, dot_path):
+        '''
+        Compile the graph and hand the resulting file to the ``open``
+        command without waiting for it (presumably the macOS opener --
+        confirm for other platforms).
+        '''
+        image = self.compile(dot_path)
+        cmd = ["open", image]
+        subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+    def _rank_repr(self):
+        '''Render all rank groups, ordered by ascending priority.'''
+        # A boolean-valued cmp function never returns a negative number, so
+        # it cannot order anything; sort on the priority itself instead.
+        ranks = sorted(
+            self.rank_groups.items(), key=lambda item: item[1].priority)
+        repr = []
+        for x in ranks:
+            repr.append(str(x[1]))
+        return '\n'.join(repr) + '\n'
+    def __str__(self):
+        reprs = [
+            'digraph G {',
+            'title = {}'.format(crepr(self.title)),
+        ]
+        for attr in self.attrs:
+            reprs.append("{key}={value};".format(
+                key=attr, value=crepr(self.attrs[attr])))
+        reprs.append(self._rank_repr())
+        # NOTE: shuffles self.nodes in place, so node order differs between
+        # renderings
+        random.shuffle(self.nodes)
+        reprs += [str(node) for node in self.nodes]
+        for x in self.edges:
+            reprs.append(str(x))
+        reprs.append('}')
+        return '\n'.join(reprs)
+class Node(object):
+    '''
+    A graph node. Its DOT identifier is ``<prefix>_<n>``, where ``n`` comes
+    from a counter shared by all nodes.
+    '''
+    # numeric suffix given to the next node that is created
+    counter = 1
+    def __init__(self, label, prefix, description="", **attrs):
+        self.label = label
+        self.name = "%s_%d" % (prefix, Node.counter)
+        Node.counter += 1
+        self.description = description
+        self.attrs = attrs
+    def __str__(self):
+        # extra attributes are appended after the label, comma separated
+        if self.attrs:
+            pairs = [
+                "%s=%s" % (key, crepr(value))
+                for key, value in self.attrs.items()
+            ]
+            extra = ',' + ','.join(pairs)
+        else:
+            extra = ""
+        return '{name} [label={label} {extra} ];'.format(
+            name=self.name, label=self.label, extra=extra)
+class Edge(object):
+    '''
+    A directed edge between two nodes, with optional DOT attributes.
+    '''
+    def __init__(self, source, target, **attrs):
+        '''
+        Link source to target.
+        :param source: Node
+        :param target: Node
+        :param attrs: dict of edge attributes
+        '''
+        self.attrs = attrs
+        self.target = target
+        self.source = source
+    def __str__(self):
+        source_name = self.source.name
+        target_name = self.target.name
+        # attributes, when present, are rendered as [k=v,k=v]
+        if not self.attrs:
+            extra = ""
+        else:
+            pairs = [
+                "{}={}".format(key, crepr(value))
+                for key, value in self.attrs.items()
+            ]
+            extra = "[" + ','.join(pairs) + "]"
+        return "{source} -> {target} {extra}".format(
+            source=source_name, target=target_name, extra=extra)
+class GraphPreviewGenerator(object):
+    '''
+    Build a styled preview graph out of parameter, argument and operator
+    nodes, and write (optionally show) it through ``Graph``.
+    '''
+    def __init__(self, title):
+        # init graphviz graph
+        self.graph = Graph(
+            title,
+            layout="dot",
+            concentrate="true",
+            rankdir="TB", )
+        # one rank group per node category
+        self.op_rank = self.graph.rank_group('same', 2)
+        self.param_rank = self.graph.rank_group('same', 1)
+        self.arg_rank = self.graph.rank_group('same', 0)
+    def __call__(self, path='temp.dot', show=False):
+        '''
+        Write the graph to ``path``; with ``show`` true it is opened as well.
+        '''
+        if not show:
+            self.graph.compile(path)
+        else:
+            self.graph.show(path)
+    def add_param(self, name, data_type, shape, highlight=False):
+        '''
+        Add a parameter node whose label is a table with the name, the data
+        type and the shape. ``shape`` is a list of strings joined with "x".
+        '''
+        label = '\n'.join([
+            '<<table cellpadding="5">',
+            '  <tr>',
+            '    <td bgcolor="#2b787e">',
+            '    <b>',
+            name,
+            '    </b>',
+            '    </td>',
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            str(data_type),
+            '    </td>',
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            '[%s]' % 'x'.join(shape),
+            '    </td>',
+            '  </tr>',
+            '</table>>',
+        ])
+        return self.graph.node(
+            label,
+            prefix="param",
+            description=name,
+            shape="none",
+            style="rounded,filled,bold",
+            width="1.3",
+            color="#148b97" if not highlight else "orange",
+            fontcolor="#ffffff",
+            fontname="Arial")
+    def add_op(self, opType, **kwargs):
+        '''
+        Add an operator node. Only the ``highlight`` keyword is used; any
+        other keyword is ignored.
+        '''
+        highlight = kwargs.pop('highlight', False)
+        return self.graph.node(
+            "<<B>%s</B>>" % opType,
+            prefix="op",
+            description=opType,
+            shape="box",
+            style="rounded, filled, bold",
+            color="#303A3A" if not highlight else "orange",
+            fontname="Arial",
+            fontcolor="#ffffff",
+            width="1.3",
+            height="0.84", )
+    def add_arg(self, name, highlight=False):
+        '''Add a node for a non-parameter variable, labelled with its name.'''
+        return self.graph.node(
+            crepr(name),
+            prefix="arg",
+            description=name,
+            shape="box",
+            style="rounded,filled,bold",
+            fontname="Arial",
+            fontcolor="#999999",
+            color="#dddddd" if not highlight else "orange")
+    def add_edge(self, source, target, **kwargs):
+        '''
+        Add an edge from source to target. ``highlight`` selects the colour;
+        all other keywords are passed on as edge attributes.
+        '''
+        highlight = kwargs.pop('highlight', False)
+        # a hex colour needs six digits; "#00000" is not a valid colour
+        return self.graph.edge(
+            source,
+            target,
+            color="#000000" if not highlight else "orange",
+            **kwargs)
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -46,6 +46,9 @@ def is_parameter(var):
 def is_persistable(var):
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+       var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
+        return False
    return var.persistable
@@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var):
        persistable=True)
-def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def save_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              save_file_name=None):
    """
    Save variables to directory by executor.
@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    :param main_program: program. If vars is None, then filter all variables in this
    program which fit `predicate`. Default default_main_program.
    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be saved.
+    as a bool. If it returns true, the corresponding input variable will be saved.
-    :param vars: variables need to be saved. If specify vars, program & predicate
+    :param vars: variables need to be saved. If vars is specified, program & predicate
    will be ignored
+    :param save_file_name: The name of a single file that all vars are saved to. 
+    If it is None, save variables to separate files.
    :return: None
    """
    if vars is None:
@@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
        save_vars(
            executor,
            dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            save_file_name=save_file_name)
    else:
        save_program = Program()
        save_block = save_program.global_block()
+        save_var_map = {}
        for each_var in vars:
            new_var = _clone_var_in_block_(save_block, each_var)
+            if save_file_name is None:
+                save_block.append_op(
+                    type='save',
+                    inputs={'X': [new_var]},
+                    outputs={},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                save_var_map[new_var.name] = new_var
+        if save_file_name is not None:
+            save_var_list = []
+            for name in sorted(save_var_map.keys()):
+                save_var_list.append(save_var_map[name])
            save_block.append_op(
-                type='save',
+                type='save_combine',
-                inputs={'X': [new_var]},
+                inputs={'X': save_var_list},
                outputs={},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                attrs={'file_path': os.path.join(dirname, save_file_name)})
        executor.run(save_program)
-def save_params(executor, dirname, main_program=None):
+def save_params(executor, dirname, main_program=None, save_file_name=None):
    """
    Save all parameters to directory with executor.
    """
@@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None):
        dirname=dirname,
        main_program=main_program,
        vars=None,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        save_file_name=save_file_name)
-def save_persistables(executor, dirname, main_program=None):
+def save_persistables(executor, dirname, main_program=None,
+                      save_file_name=None):
    """
    Save all persistables to directory with executor.
    """
@@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None):
        dirname=dirname,
        main_program=main_program,
        vars=None,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        save_file_name=save_file_name)
-def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def load_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              load_file_name=None):
    """
    Load variables from directory by executor.
-    :param executor: executor that save variable
+    :param executor: executor that load variable
    :param dirname: directory path
    :param main_program: program. If vars is None, then filter all variables in this
    program which fit `predicate`. Default default_main_program().
    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be loaded.
+    as a bool. If it returns true, the corresponding input variable will be loaded.
-    :param vars: variables need to be loaded. If specify vars, program &
+    :param vars: variables need to be loaded. If vars is specified, program &
    predicate will be ignored
+    :param load_file_name: The name of the single file that all vars are loaded from.   
+    If it is None, load variables from separate files.
    :return: None
    """
    if vars is None:
@@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
        load_vars(
            executor,
            dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            load_file_name=load_file_name)
    else:
        load_prog = Program()
        load_block = load_prog.global_block()
+        load_var_map = {}
        for each_var in vars:
            assert isinstance(each_var, Variable)
            new_var = _clone_var_in_block_(load_block, each_var)
+            if load_file_name is None:
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [new_var]},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                load_var_map[new_var.name] = new_var
+        if load_file_name is not None:
+            load_var_list = []
+            for name in sorted(load_var_map.keys()):
+                load_var_list.append(load_var_map[name])
            load_block.append_op(
-                type='load',
+                type='load_combine',
                inputs={},
-                outputs={"Out": [new_var]},
+                outputs={"Out": load_var_list},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                attrs={'file_path': os.path.join(dirname, load_file_name)})
        executor.run(load_prog)
-def load_params(executor, dirname, main_program=None):
+def load_params(executor, dirname, main_program=None, load_file_name=None):
    """
    load all parameters from directory by executor.
    """
@@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None):
        executor,
        dirname=dirname,
        main_program=main_program,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        load_file_name=load_file_name)
-def load_persistables(executor, dirname, main_program=None):
+def load_persistables(executor, dirname, main_program=None,
+                      load_file_name=None):
    """
    load all persistables from directory by executor.
    """
@@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None):
        executor,
        dirname=dirname,
        main_program=main_program,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        load_file_name=load_file_name)
 def get_inference_program(target_vars, main_program=None):
@@ -238,7 +298,8 @@ def save_inference_model(dirname,
                         feeded_var_names,
                         target_vars,
                         executor,
-                         main_program=None):
+                         main_program=None,
+                         save_file_name=None):
    """
    Build a model especially for inference,
    and save it to directory by the executor.
@@ -249,6 +310,8 @@ def save_inference_model(dirname,
    :param executor: executor that save inference model
    :param main_program: original program, which will be pruned to build the inference model.
            Default default_main_program().
+    :param save_file_name: The name of a single file that all parameters are saved to. 
+    If it is None, save parameters to separate files.
    :return: None
    """
@@ -283,25 +346,7 @@ def save_inference_model(dirname,
    with open(model_file_name, "wb") as f:
        f.write(inference_program.desc.serialize_to_string())
-    save_params(executor, dirname, main_program)
+    save_persistables(executor, dirname, inference_program, save_file_name)
-def load_persistables_if_exist(executor, dirname, main_program=None):
-    filenames = next(os.walk(dirname))[2]
-    filenames = set(filenames)
-    def _is_presistable_and_exist_(var):
-        if not is_persistable(var):
-            return False
-        else:
-            return var.name in filenames
-    load_vars(
-        executor,
-        dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=_is_presistable_and_exist_)
 def get_feed_targets_names(program):
@@ -322,13 +367,15 @@ def get_fetch_targets_names(program):
    return fetch_targets_names
-def load_inference_model(dirname, executor):
+def load_inference_model(dirname, executor, load_file_name=None):
    """
    Load inference model from a directory
    :param dirname: directory path
    :param executor: executor that load inference model
+    :param load_file_name: The name of the single file that all parameters are loaded from.   
+    If it is None, load parameters from separate files.
    :return: [program, feed_target_names, fetch_targets]
             program: program especially for inference.
             feed_target_names: Names of variables that need to feed data
@@ -342,7 +389,7 @@ def load_inference_model(dirname, executor):
        program_desc_str = f.read()
    program = Program.parse_from_string(program_desc_str)
-    load_persistables_if_exist(executor, dirname, program)
+    load_persistables(executor, dirname, program, load_file_name)
    feed_target_names = get_feed_targets_names(program)
    fetch_target_names = get_fetch_targets_names(program)
@@ -359,6 +406,7 @@ def get_parameter_value(para, executor):
    :param executor: executor for retrieving the value
    :param para: the given parameter
    :return: the LoDTensor for the parameter
    """
    assert is_parameter(para)
@@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None):
    :param name: the name of the parameter
    :param program: the program where the variable is found
            Default default_main_program().
    :return: the LoDTensor for the variable
    """
    if program is None:

--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -108,7 +108,7 @@ class ListenAndServ(object):
    """
    def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
-        self.helper = LayerHelper("recv")
+        self.helper = LayerHelper("listen_and_serv")
        self.inputs = []
        self.outputs = []
        self.endpoint = endpoint
@@ -158,7 +158,7 @@ class ListenAndServ(object):
        param_names = [p.name for p in params]
        grad_names = [g.name for g in grads]
        parent_block.append_op(
-            type='recv',
+            type='listen_and_serv',
            inputs={},
            outputs={},
            attrs={
@@ -196,3 +196,31 @@ def Send(endpoints, send_vars, get_vars):
        outputs={"Out": get_vars},
        attrs={"endpoints": endpoints,
               "epmap": epmap})
+def Recv(endpoints, get_vars):
+    """
+    Recv layer
+    Args:
+        endpoints: comma separated IP:PORT pairs. The split list is passed
+                   to the op as `epmap`, and its de-duplicated form as
+                   `endpoints`.
+        get_vars: list of vars to get from the server side.
+    Appends a `recv` op that uses `get_vars` as both its input `X` and
+    its output `Out`.
+    """
+    # This layer has no `send_vars` argument; the former assert on that
+    # undefined name raised NameError on every call.
+    assert (type(get_vars) == list)
+    epmap = endpoints.split(",")
+    endpoints = list(set(epmap))
+    helper = LayerHelper("Recv", **locals())
+    helper.append_op(
+        type="recv",
+        inputs={"X": get_vars},
+        outputs={"Out": get_vars},
+        attrs={"endpoints": endpoints,
+               "epmap": epmap})
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -145,7 +145,9 @@ def monkey_patch_variable():
            # a*b == b*a. Do not need to reverse explicitly
        ("__rmul__", "elementwise_mul", False),
        ("__div__", "elementwise_div", False),
-        ("__rdiv__", "elementwise_div", True)):
+        ("__rdiv__", "elementwise_div", True),
+        ("__pow__", "elementwise_pow", False),
+        ("__rpow__", "elementwise_pow", True)):
        setattr(Variable, method_name,
                _elemwise_method_creator_(method_name, op_type, reverse))

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs):
    return out
-def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
+def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
+    """
+    Computes dropout.
+    Drop or keep each element of `x` independently. Dropout is a regularization
+    technique for reducing overfitting by preventing neuron co-adaptation during
+    training. The dropout operator randomly sets (according to the given dropout
+    probability) the outputs of some units to zero, while the others remain
+    unchanged.
+    Args:
+       x(variable): The input tensor.
+       dropout_prob(float): Probability of setting units to zero.
+       is_test(bool): A flag indicating whether it is in test phase or not.
+       seed(int): A Python integer used to create random seeds. If this
+                  parameter is set to None, a random seed is used.
+                  NOTE: If an integer seed is given, always the same output
+                  units will be dropped. DO NOT use a fixed seed in training.
+    Returns:
+        Variable: A tensor variable.
+    Examples:
+        .. code-block:: python
+          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          dropped = fluid.layers.dropout(x=x, dropout_prob=0.5)
+    """
    helper = LayerHelper('dropout', **kwargs)
    out = helper.create_tmp_variable(dtype=x.dtype)
    mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
@@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
        inputs={'X': [x]},
        outputs={'Out': [out],
                 'Mask': [mask]},
-        attrs={'dropout_prob': dropout_prob,
+        attrs={
-               'is_test': is_test,
+            'dropout_prob': dropout_prob,
-               'seed': seed})
+            'is_test': is_test,
+            'fix_seed': seed is not None,
+            'seed': seed if seed is not None else 0
+        })
    return out
@@ -1200,10 +1231,17 @@ def conv2d(input,
    """
    if stride is None:
        stride = [1, 1]
-    helper = LayerHelper('conv2d', **locals())
-    dtype = helper.input_dtype()
    num_channels = input.shape[1]
+    l_type = 'conv2d'
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
+        l_type = 'depthwise_conv2d'
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
    if groups is None:
        num_filter_channels = num_channels
    else:
@@ -1236,7 +1274,7 @@ def conv2d(input,
    pre_bias = helper.create_tmp_variable(dtype)
    helper.append_op(
-        type='conv2d',
+        type=l_type,
        inputs={
            'Input': input,
            'Filter': filter_param,
@@ -1447,7 +1485,9 @@ def batch_norm(input,
               param_attr=None,
               bias_attr=None,
               data_layout='NCHW',
-               name=None):
+               name=None,
+               moving_mean_name=None,
+               moving_variance_name=None):
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
@@ -1477,6 +1517,7 @@ def batch_norm(input,
        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
    mean = helper.create_global_variable(
+        name=moving_mean_name,
        dtype=input.dtype,
        shape=param_shape,
        persistable=True,
@@ -1484,6 +1525,7 @@ def batch_norm(input,
    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
    variance = helper.create_global_variable(
+        name=moving_variance_name,
        dtype=input.dtype,
        shape=param_shape,
        persistable=True,

--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -56,8 +56,10 @@ __all__ = [
    'elementwise_mul',
    'elementwise_max',
    'elementwise_min',
+    'elementwise_pow',
    'clip',
    'clip_by_norm',
+    'softmax',
    'sequence_softmax',
 ] + __activations__

--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
+from ..initializer import Constant
 from ..core import DataType
 import numpy
 __all__ = [
    'create_tensor',
    'create_parameter',
+    'create_global_var',
    'cast',
    'concat',
    'sums',
@@ -58,13 +60,22 @@ def create_parameter(shape,
    Returns:
        Parameter: the created parameter
    """
-    helper = LayerHelper("create_parameter")
+    helper = LayerHelper("create_parameter", **locals())
    if attr is None:
        attr = ParamAttr()
    return helper.create_parameter(attr, shape, dtype, is_bias,
                                   default_initializer)
+def create_global_var(shape, value, dtype, persistable=False, name=None):
+    """
+    Create a global variable and attach a constant initializer to it.
+    Args:
+        shape: shape forwarded to the helper's create_global_variable
+        value: initial value; converted with float() and wrapped in a
+            Constant initializer
+        dtype: data type forwarded to create_global_variable
+        persistable(bool): forwarded as the variable's persistable flag
+        name: optional variable name, forwarded as-is
+    Returns:
+        The created global variable
+    """
+    helper = LayerHelper("global_var", **locals())
+    var = helper.create_global_variable(
+        dtype=dtype, shape=shape, persistable=persistable, name=name)
+    helper.set_variable_initializer(
+        var, initializer=Constant(value=float(value)))
+    return var
 def cast(x, dtype):
    """
    This function takes in the input with input_dtype
@@ -284,7 +295,7 @@ def fill_constant_batch_size_like(input,
    return out
-def ones(shape, dtype):
+def ones(shape, dtype, force_cpu=False):
    """
    **ones**
@@ -308,7 +319,7 @@ def ones(shape, dtype):
    return fill_constant(value=1.0, **locals())
-def zeros(shape, dtype):
+def zeros(shape, dtype, force_cpu=False):
    """
    **zeros**

--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import layers
+from framework import Variable
+__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies exponential decay to the learning rate.
+    ```python
+    if staircase:
+        decayed_learning_rate = learning_rate *
+                decay_rate ^ floor(global_step / decay_steps)
+    else:
+        decayed_learning_rate = learning_rate *
+                decay_rate ^ (global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+    Returns:
+        The decayed learning rate
+    Raises:
+        ValueError: if global_step is not a Variable.
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for exponential_decay.")
+    # update learning_rate
+    div_res = global_step / decay_steps
+    if staircase:
+        # floor the exponent so the rate only changes once per decay_steps
+        div_res = layers.floor(x=div_res)
+    return learning_rate * (decay_rate**div_res)
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies natural exponential decay to the initial learning rate.
+    ```python
+    if not staircase:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+    Returns:
+        The decayed learning rate
+    Raises:
+        ValueError: if global_step is not a Variable.
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for natural_exp_decay.")
+    div_res = global_step / decay_steps
+    if staircase:
+        # floor the ratio so the rate only changes once per decay_steps
+        div_res = layers.floor(x=div_res)
+    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Applies inverse time decay to the initial learning rate.
+    ```python
+    if staircase:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_steps))
+    else:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+    Returns:
+        The decayed learning rate
+    Raises:
+        ValueError: if global_step is not a Variable.
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+    div_res = global_step / decay_steps
+    if staircase:
+        # floor the ratio so the rate only changes once per decay_steps
+        div_res = layers.floor(x=div_res)
+    return learning_rate / (1 + decay_rate * div_res)
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -31,7 +31,7 @@ dtype_to_size = {
 class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num):
+    def __init__(self, Program, ops, forward_num, skip_opt):
        self._program = Program
        self._ops = ops
        self._forward_num = forward_num
@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
        self._defs = defaultdict(set)
        self._live_in = defaultdict(set)
        self._live_out = defaultdict(set)
+        self._skip_opt = skip_opt
    def _add_connections(self, connections):
        for node1, node2 in connections:
@@ -130,6 +131,10 @@ class ControlFlowGraph(object):
                    block_desc, x,
                    is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
                return False
+            if x in self._skip_opt:
+                return False
+            if not self._find_var(block_desc, x, is_forward).shape():
+                return False
            return True
        self._build_graph()
@@ -140,6 +145,7 @@ class ControlFlowGraph(object):
            if op.type() == "while" or op.type() == "while_grad":
                continue
            block_desc = op.block()
+            self.current_block_desc = block_desc
            is_forward = i < self._forward_num
            if self.pool:
                defs_can_optimize = filter(
@@ -197,28 +203,32 @@ def get_cfgs(input_program):
    block_desc = pdesc.block(0)
    op_size = block_desc.op_size()
    # Get global block ops
-    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
    while_sub_block_ids = []
    while_grad_sub_block_ids = []
-    while_pair = []
+    while_op_output = set()
+    while_block_id_pair = []
    for i in range(op_size):
        op = block_desc.op(i)
        if op.type() == "while":
            while_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_output.update(op.output_arg_names())
        elif op.type() == "while_grad":
            while_grad_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_output.update(op.output_arg_names())
    # Find while/while_grad block pair
    for grad_id in while_grad_sub_block_ids:
        parent_id = pdesc.block(grad_id).parent
        if parent_id in while_sub_block_ids:
-            while_pair.append((parent_id, grad_id))
+            while_block_id_pair.append((parent_id, grad_id))
            while_sub_block_ids.remove(parent_id)
    # Get while/while_grad block ops
-    for parent_id, grad_id in while_pair:
+    for parent_id, grad_id in while_block_id_pair:
        while_block_ops = []
        while_block = pdesc.block(parent_id)
        while_block_op_size = while_block.op_size()
@@ -230,7 +240,7 @@ def get_cfgs(input_program):
        for i in range(while_grad_block_op_size):
            while_block_ops.append(while_grad_block.op(i))
-        ops_list.append((while_block_ops, while_block_op_size))
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
    # Process rest while block ops
    for parent_id in while_sub_block_ids:
@@ -242,7 +252,7 @@ def get_cfgs(input_program):
        ops_list.append((while_block_ops, while_block_op_size))
-    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
    return cfgs

--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -15,6 +15,7 @@
 from collections import defaultdict
 import framework
+import layers
 from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
@@ -33,9 +34,11 @@ class Optimizer(object):
    but need to use one of it's implementation.
    """
-    def __init__(self, global_step=None, regularization=None):
+    def __init__(self, learning_rate, global_step=None, regularization=None):
+        assert learning_rate is not None
        self._global_step = global_step
        self.regularization = regularization
+        self._global_learning_rate = learning_rate
        # Dictionary of accumulators. Some optimizer subclasses need to
        # allocate and manage extra variables associated with the parameters
        # to train. These variables are called accumulators.
@@ -43,6 +46,28 @@ class Optimizer(object):
        self._accumulators = defaultdict(lambda: dict())
        self.helper = None
+    def _create_global_learning_rate(self):
+        """Make sure the global learning rate is a Variable.
+        A Python float is replaced by a persistable float32 global variable
+        of shape [1] initialised to that value. If the learning rate is
+        still not a framework.Variable afterwards, ValueError is raised.
+        """
+        if isinstance(self._global_learning_rate, float):
+            self._global_learning_rate = layers.create_global_var(
+                name=unique_name("learning_rate"),
+                shape=[1],
+                value=float(self._global_learning_rate),
+                dtype='float32',
+                persistable=True)
+        if not isinstance(self._global_learning_rate, framework.Variable):
+            # Format the message with %; passing the type as a second
+            # ValueError argument left the "%s" placeholder unexpanded.
+            raise ValueError("learning rate should be a Variable, "
+                             "actual type is %s" %
+                             type(self._global_learning_rate))
+    @property
+    def global_learning_rate(self):
+        """
+        get global decayed learning rate
+        :return:
+        """
+        return self._global_learning_rate
    def _append_optimize_op(self, block, param_and_grad):
        """ append optimize operator to block and return all the added optimize_op
        """
@@ -52,17 +77,7 @@ class Optimizer(object):
        # create learning rate variable for every parameter
        param = param_and_grad[0]
        param_lr = param.optimize_attr['learning_rate']
-        param_lr_shape = [1]
+        return self._global_learning_rate * param_lr
-        param_lr_var = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=param_lr_shape,
-            lod_level=1,
-            persistable=True)
-        param_lr = param_lr * self._learning_rate
-        self.helper.set_variable_initializer(
-            var=param_lr_var, initializer=Constant(param_lr))
-        return param_lr_var
    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters
@@ -163,7 +178,7 @@ class Optimizer(object):
          optimization. This will include parameter update ops, global step
          update ops and any other custom ops required by subclasses to manage
          their internal state.
-          :param startup_program: 
+          :param startup_program:
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
@@ -178,6 +193,7 @@ class Optimizer(object):
            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])
+            self._create_global_learning_rate()
            optimize_ops = []
            for param_and_grad in parameters_and_grads:
@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer):
    def __init__(self, learning_rate, **kwargs):
        assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(**kwargs)
+        super(SGDOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "sgd"
-        self._learning_rate = learning_rate
    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer):
    def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
        assert learning_rate is not None
        assert momentum is not None
-        super(MomentumOptimizer, self).__init__(**kwargs)
+        super(MomentumOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "momentum"
-        self._learning_rate = learning_rate
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer):
    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
        assert learning_rate is not None
        assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(**kwargs)
+        super(AdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "adagrad"
-        self._learning_rate = learning_rate
        self._epsilon = epsilon
    def _create_accumulators(self, block, parameters):
@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer):
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
-        super(AdamOptimizer, self).__init__(**kwargs)
+        super(AdamOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "adam"
-        self._learning_rate = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer):
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__(**kwargs)
+        super(AdamaxOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "adamax"
-        self._learning_rate = learning_rate
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer):
        assert decay is not None
        assert epsilon is not None
-        super(DecayedAdagradOptimizer, self).__init__(**kwargs)
+        super(DecayedAdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
        self.type = "decayed_adagrad"
-        self._learning_rate = learning_rate
        self._decay = decay
        self._epsilon = epsilon

--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import paddle.v2.fluid.core as core
+import core
 from contextlib import contextmanager
 import os
-__all__ = ['CudaProfiler']
+__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
 NVPROF_CONFIG = [
    "gpustarttimestamp",
@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
    core.enable_profiler(prof_state)
    yield
-    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The state must be in 'calls', 'total', "
-                         "'max', 'min', 'ave'")
    sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
    key_map = {
        'default': core.EventSortingKey.kDefault,
        'calls': core.EventSortingKey.kCalls,

--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE)
    list(REMOVE_ITEM TEST_OPS test_recv_op)
 endif(NOT WITH_DISTRIBUTE)
+list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)
 endforeach()
+py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 add_subdirectory(book)
 add_subdirectory(book_distribute)

--- a/python/paddle/v2/fluid/tests/book/.gitignore
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
+recognize_digits_*.inference.model
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
-py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
-py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
-py_test(test_recognize_digits_mlp_cpu
-  SRCS test_recognize_digits.py
-  ARGS mlp)
-py_test(test_recognize_digits_mlp_cuda
-  SRCS test_recognize_digits.py
-  ARGS mlp --use_cuda)
-py_test(test_recognize_digits_conv_cpu
-  SRCS test_recognize_digits.py
-  ARGS conv)
-py_test(test_recognize_digits_conv_cuda
-  SRCS test_recognize_digits.py
-  ARGS conv --use_cuda)
-py_test(test_recognize_digits_mlp_cpu_parallel
-  SRCS test_recognize_digits.py
-  ARGS mlp --parallel)
-py_test(test_recognize_digits_mlp_cuda_parallel
-  SRCS test_recognize_digits.py
-  ARGS mlp --use_cuda --parallel)
-py_test(test_recognize_digits_conv_cpu_parallel
-  SRCS test_recognize_digits.py
-  ARGS conv --parallel)
-py_test(test_recognize_digits_conv_cuda_parallel
-  SRCS test_recognize_digits.py
-  ARGS conv --use_cuda --parallel)
 # default test
 foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)

--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -12,44 +12,74 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import contextlib
+import unittest
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-sgd_optimizer.minimize(avg_cost)
-BATCH_SIZE = 20
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(x=cost)
-train_reader = paddle.batch(
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    paddle.reader.shuffle(
+    sgd_optimizer.minimize(avg_cost)
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-place = fluid.CPUPlace()
+    BATCH_SIZE = 20
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
-exe.run(fluid.default_startup_program())
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.uci_housing.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
-PASS_NUM = 100
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-for pass_id in range(PASS_NUM):
+    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    exe = fluid.Executor(place)
-    fluid.io.load_persistables(exe, "./fit_a_line.model/")
-    for data in train_reader():
+    exe.run(fluid.default_startup_program())
-        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed=feeder.feed(data),
+    PASS_NUM = 100
-                                  fetch_list=[avg_cost])
+    for pass_id in range(PASS_NUM):
-        print(avg_loss_value)
+        fluid.io.save_persistables(exe, "./fit_a_line.model/")
-        if avg_loss_value[0] < 10.0:
+        fluid.io.load_persistables(exe, "./fit_a_line.model/")
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
+        for data in train_reader():
-exit(1)
+            avg_loss_value, = exe.run(fluid.default_main_program(),
+                                      feed=feeder.feed(data),
+                                      fetch_list=[avg_cost])
+            print(avg_loss_value)
+            if avg_loss_value[0] < 10.0:
+                return
+    raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
+        avg_loss_value[0]))
+class TestFitALine(unittest.TestCase):
+    """Runs main() once per device, each time inside fresh fluid state."""
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        """Yield with a new Scope and a new main/startup Program pair active."""
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        fresh_scope = fluid.core.Scope()
+        with fluid.scope_guard(fresh_scope), fluid.program_guard(
+                main_program, startup_program):
+            yield
+    def test_cpu(self):
+        with self.program_scope_guard():
+            main(use_cuda=False)
+    def test_cuda(self):
+        # main() returns immediately when paddle was compiled without CUDA.
+        with self.program_scope_guard():
+            main(use_cuda=True)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -14,10 +14,10 @@
 from __future__ import print_function
-import sys
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import unittest
+import contextlib
 def resnet_cifar10(input, depth=32):
@@ -89,56 +89,89 @@ def vgg16_bn_drop(input):
    return fc2
-classdim = 10
+def main(net_type, use_cuda):
-data_shape = [3, 32, 32]
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    classdim = 10
+    data_shape = [3, 32, 32]
-net_type = "vgg"
-if len(sys.argv) >= 2:
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    net_type = sys.argv[1]
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-if net_type == "vgg":
+    if net_type == "vgg":
-    print("train vgg net")
+        print("train vgg net")
-    net = vgg16_bn_drop(images)
+        net = vgg16_bn_drop(images)
-elif net_type == "resnet":
+    elif net_type == "resnet":
-    print("train resnet")
+        print("train resnet")
-    net = resnet_cifar10(images, 32)
+        net = resnet_cifar10(images, 32)
-else:
+    else:
-    raise ValueError("%s network is not supported" % net_type)
+        raise ValueError("%s network is not supported" % net_type)
-predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict, label=label)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
+    avg_cost = fluid.layers.mean(x=cost)
-optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-opts = optimizer.minimize(avg_cost)
+    optimizer.minimize(avg_cost)
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-BATCH_SIZE = 128
+    BATCH_SIZE = 128
-PASS_NUM = 1
+    PASS_NUM = 1
-train_reader = paddle.batch(
+    train_reader = paddle.batch(
-    paddle.reader.shuffle(
+        paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE)
-place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-exe = fluid.Executor(place)
+    exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-exe.run(fluid.default_startup_program())
+    exe.run(fluid.default_startup_program())
-for pass_id in range(PASS_NUM):
+    loss = 0.0
-    accuracy.reset(exe)
+    for pass_id in range(PASS_NUM):
-    for data in train_reader():
+        accuracy.reset(exe)
-        loss, acc = exe.run(fluid.default_main_program(),
+        for data in train_reader():
-                            feed=feeder.feed(data),
+            loss, acc = exe.run(fluid.default_main_program(),
-                            fetch_list=[avg_cost] + accuracy.metrics)
+                                feed=feeder.feed(data),
-        pass_acc = accuracy.eval(exe)
+                                fetch_list=[avg_cost] + accuracy.metrics)
-        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc = accuracy.eval(exe)
-            pass_acc))
+            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-        # this model is slow, so if we can train two mini batch, we think it works properly.
+                pass_acc))
-        exit(0)
+            return
-exit(1)
+    raise AssertionError(
+        "Image classification loss is too large, {0:2.2}".format(loss))
+class TestImageClassification(unittest.TestCase):
+    """Runs main() for the vgg and resnet networks on CPU and on CUDA."""
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        """Yield with a new Scope and a new main/startup Program pair active."""
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        fresh_scope = fluid.core.Scope()
+        with fluid.scope_guard(fresh_scope), fluid.program_guard(
+                main_program, startup_program):
+            yield
+    def test_resnet_cpu(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=False)
+    def test_resnet_cuda(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=True)
+    def test_vgg_cpu(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=False)
+    def test_vgg_cuda(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=True)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -175,7 +175,7 @@ def main():
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
-    #place = fluid.CPUPlace()
+    # place = fluid.CPUPlace()
    place = fluid.CUDAPlace(0)
    feeder = fluid.DataFeeder(
        feed_list=[

--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -11,21 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor
+import unittest
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
-IS_SPARSE = True
 batch_size = 2
 max_length = 8
 topk_size = 50
@@ -34,10 +33,8 @@ beam_size = 2
 decoder_size = hidden_dim
-place = core.CPUPlace()
+def encoder(is_sparse):
-def encoder():
    # encoder
    src_word_id = pd.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
@@ -45,7 +42,7 @@ def encoder():
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))
    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
@@ -54,7 +51,7 @@ def encoder():
    return encoder_out
-def decoder_train(context):
+def decoder_train(context, is_sparse):
    # decoder
    trg_language_word = pd.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
@@ -62,7 +59,7 @@ def decoder_train(context):
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))
    rnn = pd.DynamicRNN()
@@ -82,10 +79,10 @@ def decoder_train(context):
    return rnn()
-def decoder_decode(context):
+def decoder_decode(context, is_sparse):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-    counter = pd.zeros(shape=[1], dtype='int64')
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
    # fill the first element with init_state
    state_array = pd.create_array('float32')
@@ -117,7 +114,7 @@ def decoder_decode(context):
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
-            is_sparse=IS_SPARSE)
+            is_sparse=is_sparse)
        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
@@ -150,7 +147,7 @@ def decoder_decode(context):
 def set_init_lod(data, lod, place):
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
    res.set(data, place)
    res.set_lod(lod)
    return res
@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
-def train_main():
+def train_main(use_cuda, is_sparse):
-    context = encoder()
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
-    rnn_out = decoder_train(context)
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    context = encoder(is_sparse)
+    rnn_out = decoder_train(context, is_sparse)
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
@@ -212,9 +213,13 @@ def train_main():
            batch_id += 1
-def decode_main():
+def decode_main(use_cuda, is_sparse):
-    context = encoder()
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
-    translation_ids, translation_scores = decoder_decode(context)
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decoder_decode(context, is_sparse)
    exe = Executor(place)
    exe.run(framework.default_startup_program())
@@ -250,6 +255,60 @@ def decode_main():
        break
+class TestMachineTranslation(unittest.TestCase):  # test methods are attached via setattr by inject_test_train / inject_test_decode
+    pass  # intentionally empty: the class body defines no tests itself
+@contextlib.contextmanager
+def scope_prog_guard():
+    """Yield with a new Scope and a new main/startup Program pair active."""
+    main_program = fluid.Program()
+    startup_program = fluid.Program()
+    fresh_scope = fluid.core.Scope()
+    with fluid.scope_guard(fresh_scope), fluid.program_guard(
+            main_program, startup_program):
+        yield
+def inject_test_train(use_cuda, is_sparse):
+    """Attach a training test for one (device, sparsity) pair.
+    The method is stored on TestMachineTranslation as
+    test_<cpu|cuda>_<dense|sparse>_train.
+    """
+    device_tag = 'cuda' if use_cuda else 'cpu'
+    sparsity_tag = 'sparse' if is_sparse else 'dense'
+    def f(*args):
+        # *args absorbs the TestCase instance that unittest passes in.
+        with scope_prog_guard():
+            train_main(use_cuda, is_sparse)
+    setattr(TestMachineTranslation,
+            'test_{0}_{1}_train'.format(device_tag, sparsity_tag), f)
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    """Attach a decode test for one (device, sparsity) pair.
+    The method is stored on TestMachineTranslation as
+    test_<cpu|cuda>_<dense|sparse>_decode; when `decorator` is given it wraps
+    the test function before it is attached.
+    """
+    device_tag = 'cuda' if use_cuda else 'cpu'
+    sparsity_tag = 'sparse' if is_sparse else 'dense'
+    def f(*args):
+        # *args absorbs the TestCase instance that unittest passes in.
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+    test_fn = f if decorator is None else decorator(f)
+    setattr(TestMachineTranslation,
+            'test_{0}_{1}_decode'.format(device_tag, sparsity_tag), test_fn)
+# Training tests: one per device/sparsity combination.
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+# Decode tests: the CUDA variants are still registered, but wrapped in
+# unittest.skip so they are reported as skipped.
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        _decorator_ = (unittest.skip(
+            reason='Beam Search does not support CUDA!')
+                       if _use_cuda_ else None)
+        inject_test_decode(
+            use_cuda=_use_cuda_, is_sparse=_is_sparse_, decorator=_decorator_)
 if __name__ == '__main__':
-    # train_main()
+    unittest.main()
-    decode_main()
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
 import paddle.v2 as paddle
 import sys
 import numpy
+import unittest
 def parse_arg():
@@ -45,8 +46,9 @@ BATCH_SIZE = 64
 def loss_net(hidden, label):
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    return fluid.layers.mean(x=loss), fluid.layers.accuracy(
+    avg_loss = fluid.layers.mean(x=loss)
-        input=prediction, label=label)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return prediction, avg_loss, acc
 def mlp(img, label):
@@ -73,25 +75,25 @@ def conv_net(img, label):
    return loss_net(conv_pool_2, label)
-def main():
+def train(nn_type, use_cuda, parallel, save_dirname):
-    args = parse_arg()
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
-    print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
+        return
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    if args.nn_type == 'mlp':
+    if nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net
-    if args.parallel:
+    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            img_ = pd.read_input(img)
            label_ = pd.read_input(label)
-            for o in net_conf(img_, label_):
+            prediction, avg_loss, acc = net_conf(img_, label_)
+            for o in [avg_loss, acc]:
                pd.write_output(o)
        avg_loss, acc = pd()
@@ -99,14 +101,14 @@ def main():
        avg_loss = fluid.layers.mean(x=avg_loss)
        acc = fluid.layers.mean(x=acc)
    else:
-        avg_loss, acc = net_conf(img, label)
+        prediction, avg_loss, acc = net_conf(img, label)
    test_program = fluid.default_main_program().clone()
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_loss)
-    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
@@ -137,13 +139,85 @@ def main():
                acc_val = numpy.array(acc_set).mean()
                avg_loss_val = numpy.array(avg_loss_set).mean()
                if float(acc_val) > 0.85:  # test acc > 85%
-                    exit(0)
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, ["img"],
+                                                      [prediction], exe)
+                    return
                else:
                    print(
                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                        format(pass_id, batch_id + 1,
                               float(avg_loss_val), float(acc_val)))
+    raise AssertionError("Loss of recognize digits is too large")
+def infer(use_cuda, save_dirname=None):  # load the inference model stored under save_dirname and run it once
+    if save_dirname is None:  # no model directory given: skip inference entirely
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be fed
+    # data using feed operators), and the fetch_targets (variables that
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+    # The conv input must be 4-D or 5-D; feed one random image of shape (1, 1, 28, 28).
+    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+    # Build the feed as a dictionary of {feed_target_name: feed_target_data};
+    # results will be a list of data, one entry per element of fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+def main(use_cuda, parallel, nn_type):
+    """Train one configuration, then hand the same save directory to infer().
+    Only the CPU, non-parallel configuration gets a save directory; for every
+    other configuration save_dirname stays None.
+    """
+    save_dirname = None
+    if not (use_cuda or parallel):
+        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
+    train(nn_type=nn_type, use_cuda=use_cuda, parallel=parallel,
+          save_dirname=save_dirname)
+    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+class TestRecognizeDigits(unittest.TestCase):  # test methods are attached via setattr by inject_test_method
+    pass  # intentionally empty: the class body defines no tests itself
+def inject_test_method(use_cuda, parallel, nn_type):
+    """Attach one test for this configuration to TestRecognizeDigits.
+    The method is stored as test_<nn_type>_<cpu|cuda>_<normal|parallel> and
+    calls main() inside freshly created Program and Scope objects.
+    """
+    device_tag = 'cuda' if use_cuda else 'cpu'
+    mode_tag = 'parallel' if parallel else 'normal'
+    def __impl__(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        fresh_scope = fluid.core.Scope()
+        with fluid.scope_guard(fresh_scope), fluid.program_guard(
+                main_program, startup_program):
+            main(use_cuda, parallel, nn_type)
+    setattr(TestRecognizeDigits,
+            'test_{0}_{1}_{2}'.format(nn_type, device_tag, mode_tag), __impl__)
+def inject_all_tests():
+    """Register one test per (use_cuda, parallel, nn_type) combination."""
+    combos = [(cuda_flag, parallel_flag, net_name)
+              for cuda_flag in (False, True)
+              for parallel_flag in (False, True)
+              for net_name in ('mlp', 'conv')]
+    for cuda_flag, parallel_flag, net_name in combos:
+        inject_test_method(cuda_flag, parallel_flag, net_name)
+inject_all_tests()
 if __name__ == '__main__':
-    main()
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,9 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
+import unittest
-import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import contextlib
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    """Build a two-branch sequence-conv classifier and attach an Adam optimizer.
+    Returns a pair: the mean cross-entropy cost and the accuracy variable.
+    """
+    embedded = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    def conv_branch(window):
+        # One sequence_conv_pool branch; only the filter size differs.
+        return fluid.nets.sequence_conv_pool(
+            input=embedded,
+            num_filters=hid_dim,
+            filter_size=window,
+            act="tanh",
+            pool_type="sqrt")
+    # Branches are created in the order filter_size=3, then filter_size=4.
+    branches = [conv_branch(3), conv_branch(4)]
+    probs = fluid.layers.fc(input=branches, size=class_dim, act="softmax")
+    loss = fluid.layers.cross_entropy(input=probs, label=label)
+    avg_cost = fluid.layers.mean(x=loss)
+    fluid.optimizer.Adam(learning_rate=0.002).minimize(avg_cost)
+    return avg_cost, fluid.layers.accuracy(input=probs, label=label)
 def stacked_lstm_net(data,
@@ -51,63 +78,77 @@ def stacked_lstm_net(data,
    avg_cost = fluid.layers.mean(x=cost)
    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
    adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
+    return avg_cost, accuracy
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
+def main(word_dict, net_method, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    BATCH_SIZE = 128
+    PASS_NUM = 5
    dict_dim = len(word_dict)
    class_dim = 2
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = stacked_lstm_net(
+    cost, acc_out = net_method(
        data, label, input_dim=dict_dim, class_dim=class_dim)
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=1000),
        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())
    for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
        for data in train_data():
            cost_val, acc_val = exe.run(fluid.default_main_program(),
                                        feed=feeder.feed(data),
                                        fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+            if cost_val < 0.4 and acc_val > 0.8:
-                  " pass_acc=" + str(pass_acc))
+                return
-            if cost_val < 1.0 and acc_val > 0.8:
+    raise AssertionError("Cost is too large for {0}".format(
-                exit(0)
+        net_method.__name__))
-    exit(1)
+class TestUnderstandSentiment(unittest.TestCase):
+    """Runs main() for the conv and stacked-LSTM nets on CPU and on CUDA."""
+    @classmethod
+    def setUpClass(cls):
+        # Load the IMDB word dictionary once and share it across all tests.
+        cls.word_dict = paddle.dataset.imdb.word_dict()
+    @contextlib.contextmanager
+    def new_program_scope(self):
+        """Yield with a new Scope and a new main/startup Program pair active."""
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        fresh_scope = fluid.core.Scope()
+        with fluid.scope_guard(fresh_scope), fluid.program_guard(
+                main_program, startup_program):
+            yield
+    def test_conv_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=False)
+    def test_conv_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=True)
+    def test_stacked_lstm_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False)
+    def test_stacked_lstm_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True)
 if __name__ == '__main__':
-    main()
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-                    hid_dim=32):
-    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = convolution_net(
-        data, label, input_dim=dict_dim, class_dim=class_dim)
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    exe.run(fluid.default_startup_program())
-    for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_data():
-            cost_val, acc_val = exe.run(fluid.default_main_program(),
-                                        feed=feeder.feed(data),
-                                        fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                  " pass_acc=" + str(pass_acc))
-            if cost_val < 1.0 and pass_acc > 0.8:
-                exit(0)
-    exit(1)
-if __name__ == '__main__':
-    main()
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-from paddle.v2.fluid.layer_helper import LayerHelper
-def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
-    """
-    This function helps create an operator for the LSTM (Long Short Term
-    Memory) cell that can be used inside an RNN.
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-    rnn = fluid.layers.StaticRNN()
-    with rnn.step():
-        c_pre = rnn.memory(init=c_pre_init)
-        x_t = rnn.step_input(x)
-        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
-        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
-        dtype = x.dtype
-        c = helper.create_tmp_variable(dtype)
-        h = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type='lstm_unit',
-            inputs={"X": after_fc,
-                    "C_prev": c_pre},
-            outputs={"C": c,
-                     "H": h},
-            attrs={"forget_bias": forget_bias})
-        rnn.update_memory(c_pre, c)
-        rnn.output(h)
-    return rnn()
-def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
-    data = fluid.layers.data(
-        name="words",
-        shape=[seq_len * batch_size, 1],
-        append_batch_size=False,
-        dtype="int64",
-        lod_level=1)
-    label = fluid.layers.data(
-        name="label",
-        shape=[batch_size, 1],
-        append_batch_size=False,
-        dtype="int64")
-    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
-    emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2])
-    c_pre_init = fluid.layers.fill_constant(
-        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
-    c_pre_init.stop_gradient = False
-    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
-    layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2])
-    prediction = fluid.layers.fc(input=layer_1_out,
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, acc
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-def chop_data(data, chop_len=80, batch_size=50):
-    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
-    return data[:batch_size]
-def prepare_feed_data(data, place):
-    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-    label = np.array(map(lambda x: x[1], data)).astype("int64")
-    label = label.reshape([len(label), 1])
-    tensor_label = fluid.LoDTensor()
-    tensor_label.set(label, place)
-    return tensor_words, tensor_label
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
-    dict_dim = len(word_dict)
-    class_dim = 2
-    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    for pass_id in xrange(PASS_NUM):
-        for data in train_data():
-            chopped_data = chop_data(data)
-            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
-            outs = exe.run(fluid.default_main_program(),
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if acc_val > 0.7:
-                exit(0)
-    exit(1)
-if __name__ == '__main__':
-    main()
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -12,76 +12,145 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import unittest
+import os
-PASS_NUM = 100
-EMBED_SIZE = 32
+def main(use_cuda, is_sparse, parallel):
-HIDDEN_SIZE = 256
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
-N = 5
+        return
-BATCH_SIZE = 32
-IS_SPARSE = True
+    PASS_NUM = 100
+    EMBED_SIZE = 32
-word_dict = paddle.dataset.imikolov.build_dict()
+    HIDDEN_SIZE = 256
-dict_size = len(word_dict)
+    N = 5
+    BATCH_SIZE = 32
-first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    IS_SPARSE = is_sparse
-second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    def __network__(words):
-forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+        embed_first = fluid.layers.embedding(
-next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+            input=words[0],
+            size=[dict_size, EMBED_SIZE],
-embed_first = fluid.layers.embedding(
+            dtype='float32',
-    input=first_word,
+            is_sparse=IS_SPARSE,
-    size=[dict_size, EMBED_SIZE],
+            param_attr='shared_w')
-    dtype='float32',
+        embed_second = fluid.layers.embedding(
-    is_sparse=IS_SPARSE,
+            input=words[1],
-    param_attr='shared_w')
+            size=[dict_size, EMBED_SIZE],
-embed_second = fluid.layers.embedding(
+            dtype='float32',
-    input=second_word,
+            is_sparse=IS_SPARSE,
-    size=[dict_size, EMBED_SIZE],
+            param_attr='shared_w')
-    dtype='float32',
+        embed_third = fluid.layers.embedding(
-    is_sparse=IS_SPARSE,
+            input=words[2],
-    param_attr='shared_w')
+            size=[dict_size, EMBED_SIZE],
-embed_third = fluid.layers.embedding(
+            dtype='float32',
-    input=third_word,
+            is_sparse=IS_SPARSE,
-    size=[dict_size, EMBED_SIZE],
+            param_attr='shared_w')
-    dtype='float32',
+        embed_forth = fluid.layers.embedding(
-    is_sparse=IS_SPARSE,
+            input=words[3],
-    param_attr='shared_w')
+            size=[dict_size, EMBED_SIZE],
-embed_forth = fluid.layers.embedding(
+            dtype='float32',
-    input=forth_word,
+            is_sparse=IS_SPARSE,
-    size=[dict_size, EMBED_SIZE],
+            param_attr='shared_w')
-    dtype='float32',
-    is_sparse=IS_SPARSE,
+        concat_embed = fluid.layers.concat(
-    param_attr='shared_w')
+            input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+        hidden1 = fluid.layers.fc(input=concat_embed,
-concat_embed = fluid.layers.concat(
+                                  size=HIDDEN_SIZE,
-    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+                                  act='sigmoid')
-hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+        predict_word = fluid.layers.fc(input=hidden1,
-predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+                                       size=dict_size,
-cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+                                       act='softmax')
-avg_cost = fluid.layers.mean(x=cost)
+        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+        avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer.minimize(avg_cost)
+        return avg_cost
-train_reader = paddle.batch(
+    word_dict = paddle.dataset.imikolov.build_dict()
-    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    dict_size = len(word_dict)
-place = fluid.CPUPlace()
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-exe = fluid.Executor(place)
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-feeder = fluid.DataFeeder(
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-    feed_list=[first_word, second_word, third_word, forth_word, next_word],
+    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-    place=place)
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-exe.run(fluid.default_startup_program())
+    if not parallel:
+        avg_cost = __network__(
-for pass_id in range(PASS_NUM):
+            [first_word, second_word, third_word, forth_word, next_word])
-    for data in train_reader():
+    else:
-        avg_cost_np = exe.run(fluid.default_main_program(),
+        places = fluid.layers.get_places()
-                              feed=feeder.feed(data),
+        pd = fluid.layers.ParallelDo(places)
-                              fetch_list=[avg_cost])
+        with pd.do():
-        if avg_cost_np[0] < 5.0:
+            avg_cost = __network__(
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
+                map(pd.read_input, [
-exit(1)
+                    first_word, second_word, third_word, forth_word, next_word
+                ]))
+            pd.write_output(avg_cost)
+        avg_cost = fluid.layers.mean(x=pd())
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(
+        feed_list=[first_word, second_word, third_word, forth_word, next_word],
+        place=place)
+    exe.run(fluid.default_startup_program())
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            avg_cost_np = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+            if avg_cost_np[0] < 5.0:
+                return
+    raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
+# Opt-in switch read from the FULL_TEST environment variable (default '0');
+# any of the listed truthy spellings, compared case-insensitively, enables it.
+FULL_TEST = os.getenv('FULL_TEST',
+                      '0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
+# Reason string attached to test cases that are skipped unless FULL_TEST is set.
+SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
+# Intentionally empty: its test methods are attached at import time by
+# inject_test_method via setattr.
+class W2VTest(unittest.TestCase):
+    pass
+def inject_test_method(use_cuda, is_sparse, parallel):
+    """Attach one generated word2vec test case to W2VTest.
+    The method name encodes the configuration as test_<device>_<storage>_<mode>.
+    Only the cuda + sparse + parallel combination runs unconditionally; every
+    other combination is skipped unless FULL_TEST is truthy.
+    """
+    device_tag = "cuda" if use_cuda else "cpu"
+    storage_tag = "sparse" if is_sparse else "dense"
+    mode_tag = "parallel" if parallel else "normal"
+    fn_name = "test_{0}_{1}_{2}".format(device_tag, storage_tag, mode_tag)
+    def __impl__(*args, **kwargs):
+        # Each case gets its own programs and scope so cases do not share state.
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)
+    always_run = use_cuda and is_sparse and parallel
+    if always_run:
+        fn = __impl__
+    else:
+        # skip the other test when on CI server
+        gate = unittest.skipUnless(condition=FULL_TEST, reason=SKIP_REASON)
+        fn = gate(__impl__)
+    setattr(W2VTest, fn_name, fn)
+# Register one test method per (use_cuda, is_sparse, parallel) combination,
+# i.e. 2 * 2 * 2 = 8 methods on W2VTest.
+for use_cuda in (False, True):
+    for is_sparse in (False, True):
+        for parallel in (False, True):
+            inject_test_method(use_cuda, is_sparse, parallel)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
    return match_indices, match_dist
-class TestBipartiteMatchOpForWithLoD(OpTest):
+class TestBipartiteMatchOpWithLoD(OpTest):
    def setUp(self):
        self.op_type = 'bipartite_match'
        lod = [[0, 5, 11, 23]]
@@ -72,7 +72,7 @@ class TestBipartiteMatchOpForWithLoD(OpTest):
        self.inputs = {'DistMat': (dist, lod)}
        self.outputs = {
            'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDis': (match_dist),
+            'ColToRowMatchDist': (match_dist),
        }
    def test_check_output(self):
@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
        self.inputs = {'DistMat': dist}
        self.outputs = {
            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDis': match_dist,
+            'ColToRowMatchDist': match_dist,
        }
    def test_check_output(self):

--- a/python/paddle/v2/fluid/tests/test_box_coder_op.py
+++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+    """NumPy reference for the box_coder op; fills output_box in place.
+    Boxes are read as corner pairs: columns 0 and 2 are combined into an x
+    center and a width, columns 1 and 3 into a y center and a height.
+    Args:
+        target_box: 2-D array with 4 columns, one row per target box.
+        prior_box: 2-D array with 4 columns, one row per prior box.
+        prior_box_var: 2-D array whose columns 0..3 are used as per-prior
+            scaling factors for x, y, width and height respectively.
+        output_box: array indexed as [target, prior, 0..3]; overwritten.
+        code_type: "EncodeCenterSize" or "DecodeCenterSize". Any other value
+            leaves output_box untouched.
+    Returns:
+        None; the result is written into output_box.
+    """
+    # Prior quantities become row vectors (1, M) so that they broadcast
+    # against the per-target column vectors (N, 1) built below.
+    prior_box_x = (
+        (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_y = (
+        (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_width = (
+        (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
+    prior_box_height = (
+        (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
+    prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
+                                          prior_box_var.shape[1])
+    if (code_type == "EncodeCenterSize"):
+        # Target corners -> center/size, each as a column vector (N, 1).
+        target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
+            target_box.shape[0], 1)
+        target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
+            target_box.shape[0], 1)
+        # Center offsets are normalized by the prior size; sizes are encoded
+        # as log ratios. Each component is then divided by its variance.
+        output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
+                prior_box_var[:,:,0]
+        output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
+                prior_box_var[:,:,1]
+        output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
+                prior_box_var[:,:,2]
+        output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
+                prior_box_var[:,:,3]
+    elif (code_type == "DecodeCenterSize"):
+        # Here target_box holds encoded values; reshape to (N, 1, 4) so each
+        # target broadcasts against every prior.
+        target_box = target_box.reshape(target_box.shape[0], 1,
+                                        target_box.shape[1])
+        target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
+                       prior_box_width + prior_box_x
+        target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
+                       prior_box_height + prior_box_y
+        target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
+                           prior_box_width
+        target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
+                            prior_box_height
+        # Convert the decoded center/size back to corner coordinates.
+        output_box[:, :, 0] = target_box_x - target_box_width / 2
+        output_box[:, :, 1] = target_box_y - target_box_height / 2
+        output_box[:, :, 2] = target_box_x + target_box_width / 2
+        output_box[:, :, 3] = target_box_y + target_box_height / 2
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+    """Run box_coder over each sequence described by the offset list lod.
+    Consecutive entries of lod delimit a slice of target_box rows. Returns a
+    float32 array of shape (num_targets, num_priors, 4); rows not covered by
+    lod stay zero.
+    """
+    num_targets = target_box.shape[0]
+    num_priors = prior_box.shape[0]
+    output_box = np.zeros((num_targets, num_priors, 4), dtype=np.float32)
+    for begin, end in zip(lod[:-1], lod[1:]):
+        # The output slice is a view, so box_coder writes straight into output_box.
+        box_coder(target_box[begin:end, :], prior_box, prior_box_var,
+                  output_box[begin:end, :, :], code_type)
+    return output_box
+class TestBoxCoderOp(OpTest):
+    """Decode-mode output check of box_coder on a single 20-box sequence."""
+    def setUp(self):
+        self.op_type = "box_coder"
+        offsets = [0, 20]
+        priors = np.random.random((10, 4)).astype('float32')
+        prior_vars = np.random.random((10, 4)).astype('float32')
+        targets = np.random.random((20, 4)).astype('float32')
+        # Expected result comes from the NumPy reference implementation.
+        expected = batch_box_coder(priors, prior_vars, targets, offsets,
+                                   "DecodeCenterSize")
+        self.inputs = {
+            'PriorBox': priors,
+            'PriorBoxVar': prior_vars,
+            'TargetBox': targets,
+        }
+        self.attrs = {'code_type': 'decode_center_size'}
+        self.outputs = {'OutputBox': expected}
+    def test_check_output(self):
+        self.check_output()
+class TestBoxCoderOpWithLoD(OpTest):
+    """Encode-mode output check of box_coder with TargetBox split into three sequences."""
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 4, 12, 20]]
+        priors = np.random.random((10, 4)).astype('float32')
+        prior_vars = np.random.random((10, 4)).astype('float32')
+        targets = np.random.random((20, 4)).astype('float32')
+        # Expected result comes from the NumPy reference implementation.
+        expected = batch_box_coder(priors, prior_vars, targets, lod[0],
+                                   "EncodeCenterSize")
+        self.inputs = {
+            'PriorBox': priors,
+            'PriorBoxVar': prior_vars,
+            'TargetBox': (targets, lod),
+        }
+        self.attrs = {'code_type': 'encode_center_size'}
+        self.outputs = {'OutputBox': expected}
+    def test_check_output(self):
+        self.check_output()
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -241,6 +241,30 @@ class TestCUDNNWith1x1(TestWith1x1):
        self.op_type = "conv2d"
+class TestDepthwiseConv(TestConv2dOp):
+    """depthwise_conv2d case: 3 groups over 3 input channels, stride 2, pad 1."""
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        # Floor division keeps the per-group channel count an int even when
+        # `/` means true division; a float here would end up in filter_size.
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+class TestDepthwiseConv2(TestConv2dOp):
+    """depthwise_conv2d case: 3 groups over 3 input channels, stride 1, pad 1."""
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        # Floor division keeps the per-group channel count an int even when
+        # `/` means true division; a float here would end up in filter_size.
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
 #  cudnn v5 does not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
 #     def init_op_type(self):

--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest):
    def setUp(self):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
        self.outputs = {
            'Out': self.inputs['X'],
            'Mask': np.ones((32, 64)).astype('float32')
@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp):
    def setUp(self):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
        self.outputs = {
            'Out': np.zeros((32, 64)).astype('float32'),
            'Mask': np.zeros((32, 64)).astype('float32')
@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp):
    def setUp(self):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
        self.outputs = {
            'Out': self.inputs['X'],
            'Mask': np.ones((32, 64, 2)).astype('float32')
@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest):
    def setUp(self):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
+        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
        self.outputs = {
            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
        }

--- a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestElementwisePowOp(OpTest):
+    """Output check for elementwise_pow with X and Y of identical shape."""
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        # Both operands are drawn from [0.1, 1).
+        base = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        exponent = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        self.inputs = {'X': base, 'Y': exponent}
+        self.outputs = {'Out': np.power(base, exponent)}
+    def test_check_output(self):
+        self.check_output()
+class TestElementwisePowOp_scalar(TestElementwisePowOp):
+    """Same check with a one-element Y applied to a 3-D X."""
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        base = np.random.rand(2, 3, 4).astype('float32')
+        exponent = np.random.rand(1).astype('float32')
+        self.inputs = {'X': base, 'Y': exponent}
+        self.outputs = {'Out': np.power(base, exponent)}
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_label_smooth_op.py
+++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestLabelSmoothOp(OpTest):
+    """Output and gradient check for label_smooth with the uniform prior."""
+    def config(self):
+        """Set op type, epsilon and a random one-hot float64 label matrix (5 x 10)."""
+        self.op_type = "label_smooth"
+        self.epsilon = 0.1
+        rows, self.label_dim = 5, 10
+        self.label = np.zeros((rows, self.label_dim)).astype("float64")
+        hot_cols = np.random.randint(self.label_dim, size=(rows))
+        self.label[np.arange(rows), hot_cols] = 1
+    def setUp(self):
+        self.config()
+        # Uniform smoothing: every class receives epsilon / label_dim.
+        uniform_share = self.epsilon / self.label_dim
+        expected = (1 - self.epsilon) * self.label + uniform_share
+        self.inputs = {'X': self.label}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': expected}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+    """Variant that feeds an explicit PriorDist row instead of the uniform prior."""
+    def setUp(self):
+        self.config()
+        prior = np.random.random((1, self.label_dim))
+        expected = (1 - self.epsilon) * self.label + self.epsilon * prior
+        self.inputs = {'X': self.label, 'PriorDist': prior}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': expected}
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from operator import mul
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.framework import grad_var_name
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    """NumPy reference forward pass of layer normalization.
+    x is temporarily viewed as an [N, D] matrix, where N is the product of the
+    dimensions before begin_norm_axis and D the product of the remaining ones;
+    each row is normalized, then scaled by scale and shifted by beta.
+    Returns:
+        (output, mean, var): output has x's original shape, mean holds the
+        per-row means, and var holds the per-row variances with epsilon
+        already added.
+    """
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    # Reshape in place (no copy); the original shape is restored before returning.
+    x.shape = [N, D]
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
+    """NumPy reference backward pass of layer normalization.
+    Args:
+        x, grad_y: input and upstream gradient; both are viewed as [N, D]
+            during the computation and restored to x's shape afterwards.
+        scale: scale parameter, viewed as [1, D] and restored afterwards.
+        mean, var: per-row statistics. The visible caller passes the values
+            returned by _reference_layer_norm_naive, so var already includes
+            epsilon there.
+        begin_norm_axis: first axis folded into D.
+    Returns:
+        (grad_x, d_scale, d_bias) with shapes [N, D], [1, D] and [1, D].
+    Side effects:
+        mean.shape and var.shape are set to [N, 1] in place and are NOT
+        restored before returning.
+    """
+    x_shape = x.shape
+    scale_shape = scale.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    # In-place reshapes (no copies) so the arithmetic below broadcasts row-wise.
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+    scale.shape = [1, D]
+    # d_bias
+    d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
+    d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
+                     axis=0).reshape([1, D])
+    # dx
+    dx_end = scale * np.sqrt(1.0 / var) * grad_y
+    d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+        [N, 1])  # the second part equals to zero.
+    d_mean = 1.0 / D * d_mean_0
+    d_std = np.sum(
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+    # Total input gradient: direct term plus the terms through mean and std.
+    grad_x = dx_end + d_mean + d_std
+    # Restore the caller-visible shapes of grad_y, x and scale.
+    grad_y.shape = x_shape
+    x.shape = x_shape
+    scale.shape = scale_shape
+    return grad_x, d_scale, d_bias
+def get_backward_op(scope, op, no_grad_set):
+    """Build the backward operator of op and make sure every variable it
+    reads or writes exists in scope as a tensor. Returns the backward operator.
+    """
+    grad_op = core.Operator.backward(op, no_grad_set)
+    for in_name in grad_op.input_vars():
+        scope.var(in_name).get_tensor()
+    for out_name in grad_op.output_vars():
+        scope.var(out_name).get_tensor()
+    return grad_op
+def create_or_get_tensor(scope, var_name, var, place):
+    """Return the tensor of variable var_name in scope, creating it if needed.
+    When var (a numpy array) is given, the tensor's LoD, dims and data are
+    set from it on place; when var is None the tensor is returned as is.
+    """
+    tensor = scope.var(var_name).get_tensor()
+    if var is None:
+        return tensor
+    assert isinstance(var, np.ndarray)
+    tensor.set_lod([[]])
+    tensor.set_dims(var.shape)
+    tensor.set(var, place)
+    return tensor
+def set_output_grad(scope, outputs, place, feed_dict=None):
+    """Fill the gradient variable of every name in outputs.
+    Args:
+        scope: scope that already holds the output variables.
+        outputs: iterable of output variable names.
+        place: device on which the gradient tensors are set.
+        feed_dict: optional mapping from output name to gradient data. Outputs
+            without an entry get an all-ones gradient with the output's shape
+            and dtype (FP32 or FP64 only).
+    Raises:
+        ValueError: if a default gradient is needed for an output whose dtype
+            is neither FP32 nor FP64.
+    """
+    # The signature defaults feed_dict to None; treat that as "no explicit
+    # gradients" instead of failing on the membership test below.
+    if feed_dict is None:
+        feed_dict = {}
+    def __set_tensor__(name, data=None):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if data is None:
+            if out_dtype == core.DataType.FP64:
+                data = np.ones(out_tensor.shape(), dtype=np.float64)
+            elif out_dtype == core.DataType.FP32:
+                data = np.ones(out_tensor.shape(), dtype=np.float32)
+            else:
+                raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
+    for output in outputs:
+        data = feed_dict[output] if output in feed_dict else None
+        __set_tensor__(output, data)
+class TestLayerNormdOp(OpTest):
+    """Checks the layer_norm operator's forward outputs and gradients against
+    the NumPy reference implementations defined in this file."""
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        """Assert via np.allclose that tensor, reshaped to np_array's shape,
+        matches np_array with the given atol."""
+        self.assertTrue(
+            np.allclose(
+                np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
+            msg)
+    def __assert_grad_close(self,
+                            tensor,
+                            np_array,
+                            name,
+                            place,
+                            max_relative_error=0.02):
+        """Assert that the largest element-wise relative difference between
+        tensor and np_array does not exceed max_relative_error."""
+        a = np.array(tensor).reshape(np_array.shape)
+        b = np_array
+        abs_a = np.abs(a)
+        # Near-zero entries would blow up the ratio; compare them absolutely.
+        abs_a[abs_a < 1e-5] = 1
+        diff_mat = np.abs(a - b) / abs_a
+        max_diff = np.max(diff_mat)
+        def err_msg():
+            offset = np.argmax(diff_mat > max_relative_error)
+            return ("%s Variable %s max gradient diff %f over limit %f, "
+                    "the first error element is %d, %f, %f") % (
+                        "Gradient Check On %s" % str(place), name, max_diff,
+                        max_relative_error, offset, a.flatten()[offset],
+                        b.flatten()[offset])
+        self.assertLessEqual(max_diff, max_relative_error, err_msg())
+    def check_forward_backward(self, shape, begin_norm_axis):
+        """Run layer_norm forward and backward on every available place and
+        compare the results with the NumPy references."""
+        def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
+            assert begin_norm_axis > 0 and begin_norm_axis < len(
+                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+            # attr
+            epsilon = 0.00001
+            x_shape = shape
+            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+            scale_shape = [D]
+            # Seed the generator so the test data is reproducible. The former
+            # np.random.random(123) only drew and discarded 123 samples.
+            np.random.seed(123)
+            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
+            # run forward
+            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
+                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
+            # get gradient
+            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
+                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
+            scope = core.Scope()
+            # create input
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
+            # create output
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
+            layer_norm_op = Operator(
+                "layer_norm",
+                # inputs
+                X="X",
+                Scale="Scale",
+                Bias="Bias",
+                # outputs
+                Y="Y",
+                Mean="Mean",
+                Variance="Variance",
+                # attrs
+                epsilon=epsilon,
+                begin_norm_axis=begin_norm_axis)
+            layer_norm_op.run(scope, place)
+            # check forward result
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
+            # run backward
+            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
+            set_output_grad(
+                scope, ["Y", "Mean", "Variance"],
+                place,
+                feed_dict={"Y": y_grad})
+            layer_norm_op_grad.run(scope, place)
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
+            # check gradient output
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
+        # Always test on CPU; add the first CUDA device when available.
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            test_with_place(place, shape, begin_norm_axis)
+    def test_check_forward_backward_with_scale_and_bias(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+    def test_check_forward_backward_with_scale(self):
+        pass  # TODO(zcd)
+    def test_check_forward_backward_with_bias(self):
+        pass  # TODO(zcd)
+    def test_check_forward_backward(self):
+        pass  # TODO(zcd)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(layers.sequence_softmax(x=seq))
        print(str(program))
+    def test_softmax(self):
+        # Softmax over an fc projection must yield a non-None variable.
+        prog = Program()
+        with program_guard(prog):
+            x = layers.data(name='data', shape=[10], dtype='float32')
+            self.assertIsNotNone(layers.softmax(x=layers.fc(input=x, size=20)))
+        print(str(prog))
    def test_get_places(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import math
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.learning_rate_decay as lr_decay
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Pure-Python reference for exponential learning-rate decay.
+
+    Returns learning_rate * decay_rate ** (global_step / decay_steps). When
+    staircase is true the exponent is floored before being applied.
+    """
+    progress = float(global_step) / float(decay_steps)
+    power = math.floor(progress) if staircase else progress
+    return learning_rate * decay_rate**power
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Pure-Python reference for natural-exponential learning-rate decay.
+
+    Returns learning_rate * exp(-decay_rate * global_step / decay_steps).
+    When staircase is true the step ratio is floored before being applied.
+    """
+    progress = float(global_step) / float(decay_steps)
+    power = math.floor(progress) if staircase else progress
+    return learning_rate * math.exp(-1 * decay_rate * power)
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Pure-Python reference for inverse-time learning-rate decay.
+
+    Returns learning_rate / (1 + decay_rate * global_step / decay_steps).
+    When staircase is true the step ratio is floored before being applied.
+    """
+    progress = float(global_step) / float(decay_steps)
+    scaled = math.floor(progress) if staircase else progress
+    return learning_rate / (1 + decay_rate * scaled)
+class TestLearningRateDecay(unittest.TestCase):
+    # Compares each fluid learning-rate decay function with the pure-Python
+    # reference of the same name defined in this module.
+    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
+        # Builds the decay computation in the current default programs, runs
+        # it for ten steps on CPU and compares every fetched rate with the
+        # reference evaluated at the loop's step number.
+        init_lr, decay_steps, decay_rate = 1.0, 5, 0.5
+        step_var = layers.create_global_var(
+            shape=[1], value=0.0, dtype='float32', persistable=True)
+        lr_var = fluid_decay_fn(
+            learning_rate=init_lr,
+            global_step=step_var,
+            decay_steps=decay_steps,
+            decay_rate=decay_rate,
+            staircase=staircase)
+        # The increment op is added after the decay ops in the program.
+        layers.increment(step_var, 1.0)
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_startup_program())
+        for step in range(10):
+            _, lr_val = exe.run(fluid.default_main_program(),
+                                feed=[],
+                                fetch_list=[step_var, lr_var])
+            expected_lr = python_decay_fn(
+                learning_rate=init_lr,
+                global_step=step,
+                decay_steps=decay_steps,
+                decay_rate=decay_rate,
+                staircase=staircase)
+            self.assertAlmostEqual(expected_lr, lr_val[0])
+    def test_decay(self):
+        # Every (reference, fluid) pair is checked with staircase on, then
+        # off, each time inside a fresh pair of main/startup programs.
+        fn_pairs = [
+            (exponential_decay, lr_decay.exponential_decay),
+            (natural_exp_decay, lr_decay.natural_exp_decay),
+            (inverse_time_decay, lr_decay.inverse_time_decay),
+        ]
+        for py_decay_fn, fluid_decay_fn in fn_pairs:
+            for staircase in (True, False):
+                banner = "decay_fn=" + str(py_decay_fn)
+                banner += " staircase=" + str(staircase)
+                print(banner)
+                main_program = framework.Program()
+                startup_program = framework.Program()
+                with framework.program_guard(main_program, startup_program):
+                    self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
+# Run this module's unittest cases when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
+++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+class TestMineHardExamplesOp(OpTest):
+    # Output check for the "mine_hard_examples" op using the "max_negative"
+    # mining type; subclasses override init_test_data() to vary the fixture.
+    def setUp(self):
+        self.op_type = "mine_hard_examples"
+        self.set_data()
+    def set_data(self):
+        # Populate the fixture attributes first, then wire them into the
+        # inputs / attrs / outputs dictionaries.
+        self.init_test_data()
+        self.inputs = dict(
+            ClsLoss=self.cls_loss,
+            LocLoss=self.loc_loss,
+            MatchIndices=self.match_indices,
+            MatchDist=self.match_dis)
+        self.attrs = dict(
+            neg_pos_ratio=self.neg_pos_ratio,
+            neg_overlap=self.neg_overlap,
+            sample_size=self.sample_size,
+            mining_type=self.mining_type)
+        self.outputs = dict(
+            NegIndices=(self.neg_indices, self.neg_indices_lod),
+            UpdatedMatchIndices=self.updated_match_indices)
+    def init_test_data(self):
+        def f32(rows):
+            return np.array(rows).astype('float32')
+        def i32(rows):
+            return np.array(rows).astype('int32')
+        # Op attributes.
+        self.mining_type = "max_negative"
+        self.neg_pos_ratio = 1.0
+        self.neg_overlap = 0.5
+        self.sample_size = 0
+        # Op inputs.
+        self.cls_loss = f32([[0.1, 0.1, 0.3], [0.3, 0.1, 0.1]])
+        self.loc_loss = f32([[0.1, 0.2, 0.3], [0.3, 0.4, 0.1]])
+        self.match_dis = f32([[0.2, 0.4, 0.8], [0.1, 0.9, 0.3]])
+        self.match_indices = i32([[0, -1, -1], [-1, 0, -1]])
+        # Expected outputs; the updated indices are the very same array
+        # object as the input indices.
+        self.updated_match_indices = self.match_indices
+        self.neg_indices = i32([[1], [0]])
+        self.neg_indices_lod = [[0, 1, 2]]
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        # Returns immediately: no gradient check is performed.
+        return
+class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
+    # Same op checked with the "hard_example" mining type and sample_size 2.
+    def init_test_data(self):
+        # Start from the parent fixture and override what differs.
+        super(TestMineHardExamplesOpHardExample, self).init_test_data()
+        def f32(rows):
+            return np.array(rows).astype('float32')
+        def i32(rows):
+            return np.array(rows).astype('int32')
+        self.mining_type = "hard_example"
+        self.sample_size = 2
+        self.cls_loss = f32([[0.5, 0.1, 0.3], [0.3, 0.1, 0.1]])
+        self.loc_loss = f32([[0.2, 0.2, 0.3], [0.3, 0.1, 0.2]])
+        self.match_indices = i32([[0, -1, -1], [-1, 0, -1]])
+        # Expected outputs for this configuration.
+        self.updated_match_indices = i32([[0, -1, -1], [-1, -1, -1]])
+        self.neg_indices = i32([[2], [0], [2]])
+        self.neg_indices_lod = [[0, 1, 3]]
+# Run this module's unittest cases when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import copy
+from op_test import OpTest
+def iou(box_a, box_b):
+    """Compute the intersection-over-union (IoU) overlap of two boxes.
+
+    Args:
+        box_a: Sequence of four coordinates; the two x values are read from
+            indices 0 and 2 and the two y values from indices 1 and 3. The
+            corners may be given in either order.
+        box_b: Second box, same layout as box_a.
+    Return:
+        The IoU ratio, or 0.0 when both boxes have no area.
+    """
+    # Normalize the corners so min/max are well defined for either order.
+    xmin_a, xmax_a = min(box_a[0], box_a[2]), max(box_a[0], box_a[2])
+    ymin_a, ymax_a = min(box_a[1], box_a[3]), max(box_a[1], box_a[3])
+    xmin_b, xmax_b = min(box_b[0], box_b[2]), max(box_b[0], box_b[2])
+    ymin_b, ymax_b = min(box_b[1], box_b[3]), max(box_b[1], box_b[3])
+    area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
+    area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
+    # Two degenerate boxes give a zero union; avoid dividing by it.
+    if area_a <= 0 and area_b <= 0:
+        return 0.0
+    inter_w = max(min(xmax_a, xmax_b) - max(xmin_a, xmin_b), 0.0)
+    inter_h = max(min(ymax_a, ymax_b) - max(ymin_a, ymin_b), 0.0)
+    inter_area = inter_w * inter_h
+    return inter_area / (area_a + area_b - inter_area)
+def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
+    """Apply non-maximum suppression at test time to avoid detecting too many
+    overlapping bounding boxes for a given object.
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class pred scores for the img, Shape:[num_priors].
+        score_threshold: (float) The confidence thresh for filtering low
+            confidence boxes.
+        nms_threshold: (float) The overlap thresh for suppressing unnecessary
+            boxes.
+        top_k: (int) The maximum number of box preds to consider; a value
+            below 0 disables the limit.
+        eta: (float) The parameter for adaptive NMS.
+    Return:
+        The indices of the kept boxes with respect to num_priors, ordered by
+        descending score.
+    """
+    all_scores = np.asarray(scores).flatten()
+    # Prior indices whose score passes the confidence threshold.
+    candidate_indices = np.argwhere(all_scores > score_threshold).flatten()
+    candidate_scores = all_scores[candidate_indices]
+    # Stable descending sort so that ties keep their prior order.
+    order = np.argsort(-candidate_scores, axis=0, kind='mergesort')
+    # `order` holds positions inside the filtered array; translate them back
+    # to prior indices, otherwise boxes[idx] below (and the returned indices)
+    # would refer to the wrong priors whenever the threshold drops any box.
+    sorted_indices = candidate_indices[order]
+    if top_k > -1 and top_k < sorted_indices.shape[0]:
+        sorted_indices = sorted_indices[:top_k]
+    selected_indices = []
+    adaptive_threshold = nms_threshold
+    for idx in sorted_indices:
+        # A candidate survives only if it does not overlap too much with any
+        # box kept so far; all() stops at the first violating overlap.
+        keep = all(
+            iou(boxes[idx], boxes[kept_idx]) <= adaptive_threshold
+            for kept_idx in selected_indices)
+        if keep:
+            selected_indices.append(idx)
+            # Adaptive NMS: tighten the threshold after each kept box.
+            if eta < 1 and adaptive_threshold > 0.5:
+                adaptive_threshold *= eta
+    return selected_indices
+def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
+                   nms_top_k, keep_top_k):
+    """Run NMS per class for one image, then keep the best detections.
+    Args:
+        boxes: Box predictions, indexed by prior and passed through to nms().
+        scores: Array indexed as scores[class][prior].
+        background: Class index that is skipped entirely.
+        score_threshold: Confidence threshold forwarded to nms().
+        nms_threshold: Overlap threshold forwarded to nms().
+        nms_top_k: Per-class candidate limit forwarded to nms() as top_k.
+        keep_top_k: Maximum number of detections kept over all classes; a
+            value below 0 disables the limit.
+    Return:
+        A tuple (selected_indices, num_det): a dict mapping each class to
+        the list of kept prior indices, and the total number of detections.
+    """
+    class_num = scores.shape[0]
+    selected_indices = {}
+    num_det = 0
+    for c in range(class_num):
+        if c == background:
+            continue
+        indices = nms(boxes, scores[c], score_threshold, nms_threshold,
+                      nms_top_k)
+        selected_indices[c] = indices
+        num_det += len(indices)
+    if keep_top_k > -1 and num_det > keep_top_k:
+        # items() works on both Python 2 and Python 3 dictionaries.
+        score_index = [(scores[c][idx], c, idx)
+                       for c, indices in selected_indices.items()
+                       for idx in indices]
+        sorted_score_index = sorted(
+            score_index, key=lambda tup: tup[0], reverse=True)
+        sorted_score_index = sorted_score_index[:keep_top_k]
+        # Regroup the surviving detections by class.
+        selected_indices = {}
+        for _, c, idx in sorted_score_index:
+            selected_indices.setdefault(c, []).append(idx)
+        num_det = keep_top_k
+    return selected_indices, num_det
+def batched_multiclass_nms(boxes, scores, background, score_threshold,
+                           nms_threshold, nms_top_k, keep_top_k):
+    """Run multiclass_nms() on every image of a batch.
+    Args:
+        boxes: Box predictions indexed by prior; shared by all images.
+        scores: Array indexed as scores[image][class][prior].
+        background, score_threshold, nms_threshold, nms_top_k, keep_top_k:
+            Forwarded unchanged to multiclass_nms().
+    Return:
+        A tuple (det_outs, lod). det_outs is a list of
+        [class, score, xmin, ymin, xmax, ymax] rows for all images; lod
+        holds the cumulative detection counts, starting with 0.
+    """
+    batch_size = scores.shape[0]
+    det_outs = []
+    lod = [0]
+    for n in range(batch_size):
+        nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
+                                               score_threshold, nms_threshold,
+                                               nms_top_k, keep_top_k)
+        lod.append(lod[-1] + nmsed_num)
+        if nmsed_num == 0:
+            continue
+        # items() works on both Python 2 and Python 3 dictionaries.
+        for c, indices in nmsed_outs.items():
+            for idx in indices:
+                xmin, ymin, xmax, ymax = boxes[idx][:]
+                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+    return det_outs, lod
+class TestMulticlassNMSOp(OpTest):
+    # Checks the "multiclass_nms" op output against the Python reference
+    # batched_multiclass_nms() on random scores and boxes.
+    def set_argument(self):
+        self.score_threshold = 0.01
+    def setUp(self):
+        self.set_argument()
+        # N images, M priors, C classes, BOX_SIZE coordinates per box.
+        N, M, C, BOX_SIZE = 7, 1200, 21, 4
+        background = 0
+        nms_threshold = 0.3
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = self.score_threshold
+        def softmax(x):
+            exps = np.exp(x - np.max(x).clip(-64.))
+            return exps / np.sum(exps)
+        # Random per-prior class scores, normalized over the class axis and
+        # rearranged to [N, C, M].
+        raw_scores = np.random.random((N * M, C)).astype('float32')
+        scores = np.apply_along_axis(softmax, 1, raw_scores)
+        scores = np.transpose(np.reshape(scores, (N, M, C)), (0, 2, 1))
+        # First two coordinates land in [0, 0.5), last two in [0.5, 1).
+        boxes = np.random.random((M, BOX_SIZE)).astype('float32')
+        boxes[:, 0:2] = boxes[:, 0:2] * 0.5
+        boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
+        nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
+                                                 score_threshold, nms_threshold,
+                                                 nms_top_k, keep_top_k)
+        # With no detections the expected output is the single value -1.
+        if not nmsed_outs:
+            nmsed_outs = [-1]
+        nmsed_outs = np.array(nmsed_outs).astype('float32')
+        self.op_type = 'multiclass_nms'
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
+        self.outputs = {'Out': (nmsed_outs, [lod])}
+        self.attrs = {
+            'background_label': background,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+        }
+    def test_check_output(self):
+        self.check_output()
+class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
+    # Overrides only the score threshold that the parent setUp() reads.
+    def set_argument(self):
+        # Use 2.0 to cover the case where there are no outputs.
+        # In practical use, 0.0 < score_threshold < 1.0
+        self.score_threshold = 2.0
+class TestIOU(unittest.TestCase):
+    # Sanity check of the Python iou() reference on a hand-computed pair.
+    def test_iou(self):
+        first = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
+        second = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
+        # Intersection area is 2 and union area is 16 for these boxes.
+        expected = np.array([2.0 / 16.0]).astype('float32')
+        actual = np.array([iou(first, second)]).astype('float32')
+        self.assertTrue(np.allclose(actual, expected))
+# Run this module's unittest cases when it is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_recv_op.py
+++ b/python/paddle/v2/fluid/tests/test_recv_op.py
@@ -19,6 +19,7 @@ import paddle.v2.fluid.layers as layers
 import numpy
 from multiprocessing import Process
 import os, sys
+import time
 class TestRecvOp(unittest.TestCase):
@@ -28,6 +29,7 @@ class TestRecvOp(unittest.TestCase):
        p = Process(target=self.init_serv, args=(place, ))
        p.daemon = True
        p.start()
+        time.sleep(1)
        self.init_client(place)
        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
        os.system("kill -9 %d" % p.pid)

--- a/python/paddle/v2/fluid/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase):
        scope = core.Scope()
        place = core.CPUPlace()
        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor = core.LoDTensor(lod_py)
+        lod_tensor = core.LoDTensor()
        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
+        lod_tensor.alloc_float(place)
+        tensor_array = numpy.array(lod_tensor)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
+        lod_v = numpy.array(lod_tensor)
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertListEqual(lod_py, lod_tensor.lod())
+    def test_lod_tensor_gpu_init(self):
+        if not core.is_compiled_with_cuda():
+            return
+        scope = core.Scope()
+        place = core.CUDAPlace(0)
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
        lod_tensor.alloc_float(place)
        tensor_array = numpy.array(lod_tensor)
        tensor_array[0, 0, 0, 0] = 1.0