Merge branch 'develop' into inference_lib_dist

446198da · Luo Tao · 55b5f29e · be815dd0 · 446198da · 446198da
176 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11

--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
+# Image for the distributed VGG16 benchmark (Fluid and v2 trainers/pservers).
+# Alternative base kept for reference:
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+# The CUDA base image does not ship pip, so install it together with python and
+# the other system packages in one apt layer before the first `pip install`.
+RUN apt-get update && \
+    apt-get install -y python python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+# The released package is installed only temporarily to call
+# paddle.dataset.cifar.train10() at build time (presumably to cache the CIFAR-10
+# data in the image -- confirm), then removed again.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+# below lines may change a lot for debugging
+# Explicit destination file names make sure the downloads land where the
+# `chmod` below (and the job command `paddle_k8s`) expect them.
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin/paddle_k8s
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root/k8s_tools.py
+# Locally built wheel (with distribute support) copied next to this Dockerfile.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+    chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
+# Performance for Distributed vgg16
+## Test Result
+### Hardware Information
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+### Single Node Single Thread
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+### Different Batch Size
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+### Accelerate Rate
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+### Different Pserver Count
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+*The performance gap between Fluid and v2 comes from network interference.*
+## Steps to Run the Performance Test
+1. You must re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
+1. Run `kubectl create -f fluid_pserver.yaml && kubectl create -f fluid_trainer.yaml` (or the `v2_pserver.yaml` / `v2_trainer.yaml` pair for the v2 benchmark) to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+Check the logs for the distributed training progress and analyze the performance.
+## Enable Verbose Logs
+Edit the pserver and trainer YAML manifests (e.g. `fluid_pserver.yaml` and `fluid_trainer.yaml`) and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
+# Fluid VGG16 benchmark, parameter-server side: a ReplicaSet with 10 replicas
+# whose container runs `paddle_k8s start_fluid` with TRAINING_ROLE=PSERVER.
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
+# Fluid VGG16 benchmark, trainer side: a Job with parallelism/completions of 20
+# whose container runs `paddle_k8s start_fluid` with TRAINING_ROLE=TRAINER and
+# an ENTRY command that passes `--batch_size 128` to vgg16_fluid.py.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
+# v2 VGG16 benchmark, parameter-server side: a ReplicaSet with 10 replicas
+# whose container runs `paddle_k8s start_pserver`.
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
+# v2 VGG16 benchmark, trainer side: a Job with parallelism/completions of 20
+# whose container runs `paddle_k8s start_trainer v2`; BATCH_SIZE is set to 256
+# and the ENTRY command launches /workspace/vgg16_v2.py.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='CPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--local',
+    type=str2bool,
+    default=True,
+    help='Whether to run as local mode.')
+args = parser.parse_args()
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+def main():
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    # Evaluator
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+        args.device_id)
+    exe = fluid.Executor(place)
+    # test
+    def test(exe):
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+            exe.run(inference_program,
+                    feed={"pixel": img_data,
+                          "label": y_data})
+        return accuracy.eval(exe)
+    def train_loop(exe, trainer_prog):
+        iters = 0
+        ts = time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            start_time = time.time()
+            num_samples = 0
+            accuracy.reset(exe)
+            with profiler.profiler("CPU", 'total') as prof:
+                for batch_id, data in enumerate(train_reader()):
+                    ts = time.time()
+                    img_data = np.array(
+                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                            "float32")
+                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                    y_data = y_data.reshape([-1, 1])
+                    loss, acc = exe.run(
+                        trainer_prog,
+                        feed={"pixel": img_data,
+                              "label": y_data},
+                        fetch_list=[avg_cost] + accuracy.metrics)
+                    iters += 1
+                    num_samples += len(data)
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                        % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is the accumulation of batches, but not the current batch.
+            pass_elapsed = time.time() - start_time
+            pass_train_acc = accuracy.eval(exe)
+            pass_test_acc = test(exe)
+            print(
+                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                   pass_test_acc))
+    if args.local:
+        # Parameter initialization
+        exe.run(fluid.default_startup_program())
+        # data reader
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                else paddle.dataset.flowers.train(),
+                buf_size=5120),
+            batch_size=args.batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            batch_size=args.batch_size)
+        train_loop(exe, fluid.default_main_program())
+    else:
+        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, "6174"]))
+        pserver_endpoints = ",".join(eplist)
+        print("pserver endpoints: ", pserver_endpoints)
+        trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
+        current_endpoint = os.getenv(
+            "POD_IP") + ":6174"  # current pserver endpoint
+        training_role = os.getenv(
+            "TRAINING_ROLE",
+            "TRAINER")  # get the training role: trainer/pserver
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            optimize_ops,
+            params_grads,
+            pservers=pserver_endpoints,
+            trainers=trainers)
+        if training_role == "PSERVER":
+            if not current_endpoint:
+                print("need env SERVER_ENDPOINT")
+                exit(1)
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            print("starting server side startup")
+            exe.run(pserver_startup)
+            print("starting parameter server...")
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            # Parameter initialization
+            exe.run(fluid.default_startup_program())
+            # data reader
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                    else paddle.dataset.flowers.train(),
+                    buf_size=5120),
+                batch_size=args.batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+                paddle.dataset.flowers.test(),
+                batch_size=args.batch_size)
+            trainer_prog = t.get_trainer_program()
+            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+            exe.run(fluid.default_startup_program())
+            train_loop(exe, trainer_prog)
+        else:
+            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+if __name__ == "__main__":
+    print_arguments()
+    main()
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import gzip
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+def vgg(input, nums, class_dim):
+    def conv_block(input, num_filter, groups, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+    fc_dim = 512
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+def main():
+    global ts
+    paddle.init(use_gpu=False)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+    extra_layers = None
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+    # Create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar.train10(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        cifar.test10(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+    # End batch and end pass event handler
+    def event_handler(event):
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
+        if isinstance(event, paddle.event.EndPass):
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+if __name__ == '__main__':
+    main()
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 include_directories(${BOOST_INCLUDE_DIR})

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
    endif()
    if (cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)
@@ -457,12 +468,12 @@ endfunction()
 function(py_test TARGET_NAME)
  if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
    :noindex:
+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
 Norm Layer
 ==========
@@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
    :noindex:
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
 Recurrent Layer Group
 =====================
@@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:
+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 Sampling Layers
 ===============
@@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:
-Factorization Machine Layer
-============================
-factorization_machine
---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-Slicing and Joining Layers
-==========================
-pad
----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
 ..  _api_v2.layer_costs:
 Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
    :noindex:
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
 Check Layer
 ============
@@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:
-Miscs
+Activation
-=====
+==========
-dropout
--------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-Activation with learnable parameter
-===================================
 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
-gated_unit
-----------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-Detection output Layer
-======================
-detection_output
----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
+wmt16
++++++
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-DataFeeder
+data_feeder
 ===========
 DataFeeder
-----------
+----------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
    :noindex:
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Evaluator
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+=========
-Evaluator
+evaluator
-----------
+=========
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+Accuracy
+--------
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
    :noindex:
+ChunkEvaluator
+--------------
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Executor
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+========
+executor
+========
 Executor
+--------
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+global_scope
+------------
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+switch_scope
+------------
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
    :noindex:
--- a/doc/api/v2/fluid/gen_doc.py
+++ b/doc/api/v2/fluid/gen_doc.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import sys
+import types
+import paddle.v2.fluid as fluid
+def parse_arg():
+    """Parse the command line of the documentation generator.
+    Accepts one positional `module` string and an optional `--submodules`
+    flag followed by zero or more names. Returns the namespace produced by
+    `argparse`; `submodules` is None when the flag is not given.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--submodules', nargs="*")
+    cli.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return cli.parse_args()
+class DocGenerator(object):
+    """Writes Sphinx reStructuredText API stubs for one `paddle.v2.fluid` module.
+    Construction resolves `fluid.<module_name>` and immediately writes the
+    "generated file" banner followed by the page title to `stream`; each
+    `print_*` method then appends one section or one entry.
+    """
+    def __init__(self, module_name, stream=sys.stdout):
+        """Resolve the module and emit the banner and the page title.
+        Args:
+            module_name: attribute name of the module under `paddle.v2.fluid`.
+            stream: object whose `write` method receives the generated text.
+        Raises:
+            ValueError: if `fluid` has no attribute named `module_name`.
+        """
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+''')
+        self._print_header_(module_name, dot='=', is_title=True)
+    def print_submodule(self, submodule_name):
+        """Emit a section for `submodule_name` plus one entry per name in its `__all__`.
+        Raises:
+            ValueError: if the module has no such attribute, or it is None.
+        """
+        # Default to None so that a missing attribute reaches the ValueError
+        # below instead of escaping as a bare AttributeError.
+        submodule = getattr(self.module, submodule_name, None)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+        for item in submodule.__all__:
+            self.print_item(item)
+    def print_current_module(self):
+        """Emit one entry for every name listed in the module's `__all__`."""
+        for item in self.module.__all__:
+            self.print_item(item)
+    def print_section(self, name):
+        """Emit a section heading underlined with '='."""
+        self._print_header_(name, dot='=', is_title=False)
+    def print_item(self, name):
+        """Emit an autoclass or autofunction entry for attribute `name`.
+        The attribute is looked up on the top-level module, also when this
+        is called from `print_submodule`.
+        Raises:
+            RuntimeError: if the attribute is neither a class nor a plain function.
+        """
+        item = getattr(self.module, name)
+        # `type` is the object Python 2 aliases as `types.TypeType`; unlike
+        # that alias it also exists on Python 3.
+        if isinstance(item, type):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+    def print_class(self, name):
+        """Emit a heading and an `autoclass` directive (with `:members:`) for `name`."""
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+'''.format(self.module_name, name))
+    def print_method(self, name):
+        """Emit a heading and an `autofunction` directive for `name`."""
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+'''.format(self.module_name, name))
+    def _print_header_(self, name, dot, is_title):
+        """Write `name` underlined by `dot` repeated to the length of `name`.
+        When `is_title` is true the same line is also written above the name.
+        A trailing empty line is always written.
+        """
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+def main():
+    """Generate the page for the module named on the command line.
+    With `--submodules`, one section is printed per listed submodule;
+    without it, the module's own `__all__` is printed.
+    """
+    cli_args = parse_arg()
+    generator = DocGenerator(cli_args.module)
+    if cli_args.submodules is not None:
+        for submodule_name in cli_args.submodules:
+            generator.print_submodule(submodule_name)
+    else:
+        generator.print_current_module()
+if __name__ == '__main__':
+    main()
--- a/doc/api/v2/fluid/gen_doc.sh
+++ b/doc/api/v2/fluid/gen_doc.sh
+#!/bin/bash
+# Regenerate the fluid API .rst pages by running gen_doc.py in this directory.
+# `layers` is split into one section per submodule.
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+# Every other module gets a flat page; each module is listed exactly once
+# (`io` used to appear twice, regenerating io.rst a second time).
+for module in data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+  python gen_doc.py "${module}" > "${module}.rst"
+done
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-Initializer
+initializer
 ===========
+Constant
+--------
+..  autoclass:: paddle.v2.fluid.initializer.Constant
-Initializer
+    :members:
-----------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: Initializer
-    :noindex:
-ConstantInitializer
-------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: ConstantInitializer
    :noindex:
+Uniform
+-------
+..  autoclass:: paddle.v2.fluid.initializer.Uniform
-UniformInitializer
+    :members:
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: UniformInitializer
-    :noindex:
-NormalInitializer
-----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: NormalInitializer
    :noindex:
+Normal
+------
-XavierInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Normal
-----------------
+    :members:
-..  automodule:: paddle.v2.fluid.initializer
-    :members: XavierInitializer
    :noindex:
+Xavier
+------
-MSRAInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Xavier
---------------
+    :members:
-..  automodule:: paddle.v2.fluid.initializer
-    :members: MSRAInitializer
    :noindex:
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-IO
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+==
+io
+==
+save_vars
+---------
-is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_vars
+    :noindex:
+save_params
 -----------
-..  autofunction:: paddle.v2.fluid.io.is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_params
+    :noindex:
+save_persistables
+-----------------
+..  autofunction:: paddle.v2.fluid.io.save_persistables
+    :noindex:
+load_vars
+---------
+..  autofunction:: paddle.v2.fluid.io.load_vars
+    :noindex:
+load_params
+-----------
+..  autofunction:: paddle.v2.fluid.io.load_params
    :noindex:
+load_persistables
+-----------------
+..  autofunction:: paddle.v2.fluid.io.load_persistables
+    :noindex:
+save_inference_model
+--------------------
+..  autofunction:: paddle.v2.fluid.io.save_inference_model
+    :noindex:
+load_inference_model
+--------------------
+..  autofunction:: paddle.v2.fluid.io.load_inference_model
+    :noindex:
+get_inference_program
+---------------------
+..  autofunction:: paddle.v2.fluid.io.get_inference_program
+    :noindex:
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
-==========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Layers
+    !DO NOT EDIT THIS FILE MANUALLY!
-==========
+======
+layers
+======
-fc
+control_flow
---
+============
-..  autofunction:: paddle.v2.fluid.layers.fc
+split_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:
-embedding
+merge_lod_tensor
---------
+----------------
-..  autofunction:: paddle.v2.fluid.layers.embedding
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
    :noindex:
-dynamic_lstm
+BlockGuard
------------
+----------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+    :members:
    :noindex:
-dynamic_lstmp
+BlockGuardWithCompletion
-------------
+------------------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+    :members:
    :noindex:
-dynamic_gru
+StaticRNNMemoryLink
-----------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+    :members:
    :noindex:
-data
+WhileGuard
----
+----------
-..  autofunction:: paddle.v2.fluid.layers.data
+..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+    :members:
    :noindex:
-mean
+While
----
+-----
-..  autofunction:: paddle.v2.fluid.layers.mean
+..  autoclass:: paddle.v2.fluid.layers.While
+    :members:
    :noindex:
-mul
+lod_rank_table
---
+--------------
-..  autofunction:: paddle.v2.fluid.layers.mul
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:
-elementwise_add
+max_sequence_len
---------------
+----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:
-elementwise_sub
+topk
---------------
+----
-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:
-elementwise_mul
+lod_tensor_to_array
---------------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:
-elementwise_div
+array_to_lod_tensor
---------------
+-------------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:
+increment
+---------
-dropout
+..  autofunction:: paddle.v2.fluid.layers.increment
-------
-..  autofunction:: paddle.v2.fluid.layers.dropout
    :noindex:
+array_write
+-----------
-reshape
+..  autofunction:: paddle.v2.fluid.layers.array_write
--------
-..  autofunction:: paddle.v2.fluid.layers.reshape
    :noindex:
+create_array
+------------
-sigmoid
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+less_than
 ---------
-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+..  autofunction:: paddle.v2.fluid.layers.less_than
    :noindex:
+array_read
+----------
-scale
+..  autofunction:: paddle.v2.fluid.layers.array_read
---------
+    :noindex:
-..  autofunction:: paddle.v2.fluid.layers.scale
+shrink_memory
+-------------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
    :noindex:
+array_length
+------------
-transpose
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+IfElse
+------
+..  autoclass:: paddle.v2.fluid.layers.IfElse
+    :members:
+    :noindex:
+DynamicRNN
+----------
+..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+ConditionalBlock
+----------------
+..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+StaticRNN
 ---------
-..  autofunction:: paddle.v2.fluid.layers.transpose
+..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+    :members:
    :noindex:
+reorder_lod_tensor_by_rank
+--------------------------
-sigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
---------------------------------
-..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
    :noindex:
+ParallelDo
+----------
-cast
+..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+Print
+-----
+..  autofunction:: paddle.v2.fluid.layers.Print
+    :noindex:
+device
+======
+get_places
+----------
+..  autofunction:: paddle.v2.fluid.layers.get_places
+    :noindex:
+io
+==
+data
 ----
-..  autofunction:: paddle.v2.fluid.layers.cast
+..  autofunction:: paddle.v2.fluid.layers.data
    :noindex:
+BlockGuardServ
+--------------
-concat
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
-------
+    :members:
-..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:
+ListenAndServ
+-------------
-sums
+..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+Send
 ----
-..  autofunction:: paddle.v2.fluid.layers.sums
+..  autofunction:: paddle.v2.fluid.layers.Send
    :noindex:
+nn
+==
-linear_chain_crf
+fc
----------------
+--
-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+..  autofunction:: paddle.v2.fluid.layers.fc
    :noindex:
+embedding
+---------
-assign
-------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
    :noindex:
+dynamic_lstm
+------------
-split_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
----------------
-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:
+dynamic_lstmp
+-------------
-merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+dynamic_gru
+-----------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+gru_unit
+--------
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+linear_chain_crf
 ----------------
-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+crf_decoding
+------------
+..  autofunction:: paddle.v2.fluid.layers.crf_decoding
    :noindex:
 cos_sim
--------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
    :noindex:
 cross_entropy
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
    :noindex:
 square_error_cost
 -----------------
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
    :noindex:
 accuracy
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.accuracy
    :noindex:
+chunk_eval
+----------
+..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+    :noindex:
 sequence_conv
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
    :noindex:
 conv2d
 ------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
    :noindex:
 sequence_pool
 -------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
    :noindex:
+pool2d
+------
-sequence_first_step
+..  autofunction:: paddle.v2.fluid.layers.pool2d
-------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
    :noindex:
+batch_norm
+----------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
-sequence_last_step
+beam_search_decode
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
    :noindex:
+conv2d_transpose
+----------------
-pool2d
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
------
-..  autofunction:: paddle.v2.fluid.layers.pool2d
    :noindex:
+sequence_expand
+---------------
-batch_norm
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+reduce_sum
 ----------
-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+reduce_mean
+-----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
    :noindex:
+reduce_max
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
-beam_search_decode
+reduce_min
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+sequence_first_step
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+sequence_last_step
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+dropout
+-------
+..  autofunction:: paddle.v2.fluid.layers.dropout
    :noindex:
+split
+-----
-lod_rank_table
+..  autofunction:: paddle.v2.fluid.layers.split
--------------
-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:
+ctc_greedy_decoder
+------------------
-max_sequence_len
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
----------------
-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:
+edit_distance
+-------------
-topk
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
-----
-..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:
+l2_normalize
+------------
-lod_tensor_to_array
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
-------------------
-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:
+matmul
+------
+..  autofunction:: paddle.v2.fluid.layers.matmul
-array_to_lod_tensor
-------------------
-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:
+warpctc
+-------
+..  autofunction:: paddle.v2.fluid.layers.warpctc
+    :noindex:
+sequence_reshape
+----------------
-fill_constant
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
-------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
-fill_constant_batch_size_like
+im2sequence
-----------------------------
+-----------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
    :noindex:
+nce
+---
-ones
+..  autofunction:: paddle.v2.fluid.layers.nce
----
-..  autofunction:: paddle.v2.fluid.layers.ones
    :noindex:
+beam_search
+-----------
-zeros
+..  autofunction:: paddle.v2.fluid.layers.beam_search
-----
-..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:
+row_conv
+--------
-increment
+..  autofunction:: paddle.v2.fluid.layers.row_conv
---------
-..  autofunction:: paddle.v2.fluid.layers.increment
    :noindex:
+multiplex
+---------
-array_write
+..  autofunction:: paddle.v2.fluid.layers.multiplex
-----------
-..  autofunction:: paddle.v2.fluid.layers.array_write
    :noindex:
+ops
+===
+mean
+----
-create_array
+..  autofunction:: paddle.v2.fluid.layers.mean
------------
-..  autofunction:: paddle.v2.fluid.layers.create_array
    :noindex:
+mul
+---
-less_than
+..  autofunction:: paddle.v2.fluid.layers.mul
---------
-..  autofunction:: paddle.v2.fluid.layers.less_than
    :noindex:
+reshape
+-------
-array_read
+..  autofunction:: paddle.v2.fluid.layers.reshape
----------
-..  autofunction:: paddle.v2.fluid.layers.array_read
    :noindex:
+scale
+-----
-shrink_memory
+..  autofunction:: paddle.v2.fluid.layers.scale
--------------
-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
    :noindex:
+sigmoid_cross_entropy_with_logits
+---------------------------------
-array_length
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
-------------
-..  autofunction:: paddle.v2.fluid.layers.array_length
    :noindex:
+elementwise_add
+---------------
-conv2d_transpose
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
----------------
-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
    :noindex:
+elementwise_div
-sequence_expand
 ---------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
    :noindex:
+elementwise_sub
+---------------
-gru_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
--------
-..  autofunction:: paddle.v2.fluid.layers.gru_unit
    :noindex:
+elementwise_mul
+---------------
-lstm_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
---------
-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
    :noindex:
+elementwise_max
+---------------
-sequence_softmax
+..  autofunction:: paddle.v2.fluid.layers.elementwise_max
----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
    :noindex:
+elementwise_min
+---------------
-reduce_sum
+..  autofunction:: paddle.v2.fluid.layers.elementwise_min
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
    :noindex:
+elementwise_pow
+---------------
-reduce_mean
+..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
    :noindex:
+clip
+----
-reduce_max
+..  autofunction:: paddle.v2.fluid.layers.clip
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_max
    :noindex:
+clip_by_norm
+------------
-reduce_min
+..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_min
    :noindex:
+sequence_softmax
+----------------
-split
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
-----
-..  autofunction:: paddle.v2.fluid.layers.split
    :noindex:
+sigmoid
+-------
-matmul
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
------
-..  autofunction:: paddle.v2.fluid.layers.matmul
    :noindex:
 logsigmoid
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
    :noindex:
 exp
 ---
 ..  autofunction:: paddle.v2.fluid.layers.exp
    :noindex:
 relu
 ----
 ..  autofunction:: paddle.v2.fluid.layers.relu
    :noindex:
 tanh
 ----
 ..  autofunction:: paddle.v2.fluid.layers.tanh
    :noindex:
 tanh_shrink
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
    :noindex:
 softshrink
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.softshrink
    :noindex:
 sqrt
 ----
 ..  autofunction:: paddle.v2.fluid.layers.sqrt
    :noindex:
 abs
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.abs
    :noindex:
 ceil
 ----
 ..  autofunction:: paddle.v2.fluid.layers.ceil
    :noindex:
 floor
 -----
 ..  autofunction:: paddle.v2.fluid.layers.floor
    :noindex:
 round
 -----
 ..  autofunction:: paddle.v2.fluid.layers.round
    :noindex:
 reciprocal
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.reciprocal
    :noindex:
 log
 ---
 ..  autofunction:: paddle.v2.fluid.layers.log
    :noindex:
 square
 ------
 ..  autofunction:: paddle.v2.fluid.layers.square
    :noindex:
 softplus
 --------
 ..  autofunction:: paddle.v2.fluid.layers.softplus
    :noindex:
 softsign
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.softsign
    :noindex:
 brelu
 -----
 ..  autofunction:: paddle.v2.fluid.layers.brelu
    :noindex:
 leaky_relu
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.leaky_relu
    :noindex:
 soft_relu
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.soft_relu
    :noindex:
 elu
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.elu
    :noindex:
 relu6
 -----
 ..  autofunction:: paddle.v2.fluid.layers.relu6
    :noindex:
 pow
----
+---
 ..  autofunction:: paddle.v2.fluid.layers.pow
    :noindex:
+stanh
+-----
+..  autofunction:: paddle.v2.fluid.layers.stanh
+    :noindex:
 hard_shrink
 -----------
 ..  autofunction:: paddle.v2.fluid.layers.hard_shrink
    :noindex:
 thresholded_relu
 ----------------
 ..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
    :noindex:
 hard_sigmoid
-------------
+------------
 ..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
    :noindex:
 swish
------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.swish
    :noindex:
-im2sequence
+tensor
+======
+create_tensor
+-------------
+..  autofunction:: paddle.v2.fluid.layers.create_tensor
+    :noindex:
+create_parameter
+----------------
+..  autofunction:: paddle.v2.fluid.layers.create_parameter
+    :noindex:
+create_global_var
+-----------------
+..  autofunction:: paddle.v2.fluid.layers.create_global_var
+    :noindex:
+cast
+----
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+concat
 ------
-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:
-edit_distance
+sums
---------------
+----
-..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
+..  autofunction:: paddle.v2.fluid.layers.sums
    :noindex:
-ctc_greedy_decoder
+assign
---------------
+------
-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+..  autofunction:: paddle.v2.fluid.layers.assign
    :noindex:
-l2_normalize
+fill_constant_batch_size_like
------------
+-----------------------------
-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
    :noindex:
-sequence_reshape
+fill_constant
----------------
+-------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:
-row_conv
+ones
--------
+----
-..  autofunction:: paddle.v2.fluid.layers.row_conv
+..  autofunction:: paddle.v2.fluid.layers.ones
    :noindex:
-multiplex
+zeros
---------
+-----
-..  autofunction:: paddle.v2.fluid.layers.multiplex
+..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Nets
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+====
+nets
+====
 simple_img_conv_pool
 --------------------
-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-    :noindex:
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-img_conv_group
---------------
-..  autofunction:: paddle.v2.fluid.nets.img_conv_group
    :noindex:
 sequence_conv_pool
 ------------------
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:
 glu
 ---
 ..  autofunction:: paddle.v2.fluid.nets.glu
    :noindex:
 scaled_dot_product_attention
 ----------------------------
 ..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
    :noindex:
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Optimizer
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
-Optimizer
-----------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: Optimizer
-    :noindex:
+=========
+optimizer
+=========
-SGDOptimizer
+SGD
-----------
+---
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: SGDOptimizer
-    :noindex:
+..  autoclass:: paddle.v2.fluid.optimizer.SGD
+    :members:
+    :noindex:
+Momentum
+--------
-MomentumOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Momentum
-----------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: MomentumOptimizer
    :noindex:
+Adagrad
+-------
+..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
-AdagradOptimizer
+    :members:
----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdagradOptimizer
    :noindex:
+Adam
+----
-AdamOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adam
-------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamOptimizer
    :noindex:
+Adamax
+------
-AdamaxOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adamax
-----------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamaxOptimizer
    :noindex:
+DecayedAdagrad
+--------------
-DecayedAdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
-----------------------
+    :members:
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: DecayedAdagradOptimizer
    :noindex:
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+==========
+param_attr
+==========
 ParamAttr
-===========
+---------
+..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
+WeightNormParamAttr
+-------------------
-ParamAttr
+..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
-----------
+    :members:
-..  automodule:: paddle.v2.fluid.param_attr
-    :members: ParamAttr
    :noindex:
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-Profiler
+    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+========
+profiler
+========
+cuda_profiler
+-------------
-Profiler
-----------
 ..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
    :noindex:
+reset_profiler
+--------------
+..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+    :noindex:
+profiler
+--------
+..  autofunction:: paddle.v2.fluid.profiler.profiler
+    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 ===========
-Regularizer
+regularizer
 ===========
-WeightDecayRegularizer
+append_regularization_ops
----------------------
+-------------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: WeightDecayRegularizer
-    :noindex:
-L2DecayRegularizer
+..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L2DecayRegularizer
    :noindex:
+L1Decay
+-------
+..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
-L1DecayRegularizer
+L2Decay
-------------------
+-------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L1DecayRegularizer
+..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/README.MD
@@ -140,7 +140,19 @@ TODO by Assignees
 ### Beam Search with CTC and LM
-TODO by Assignees
+<div align="center">
+<img src="image/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to the one in \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
+   - 2) the if condition ```if l^+ not in A_prev then``` after the computation of probabilities is removed, because it is hard to understand and seems unnecessary.
+- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
+- Such an external scorer consists of a language model, a word count, or any other custom scorers.
+- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7).
+- This decoder needs to run with **high efficiency**, both for convenient parameter tuning and for real-world speech recognition.
 ## Future Work
@@ -153,3 +165,4 @@ TODO by Assignees
 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
--- a/doc/design/speech/image/beam_search.png
+++ b/doc/design/speech/image/beam_search.png
--- a/doc/design/switch.md
+++ b/doc/design/switch.md
+### Design Doc: Switch
+### Background
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+The following example shows the usage of `fluid.switch`.
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+switch = fluid.switch()
+with switch.block():
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+### The Semantics
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar. This differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case; if no case matches, it runs the default case, if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+The above program should print and print only "Case 1".
+The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种B
    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
    "WITH_DOC", "是否编译中英文文档", "OFF"
    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_AVX", "Build with AVX support", "ON"
    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
    "WITH_STYLE_CHECK", "Check code style when building", "ON"
-    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
     docker run -p 8888:8888 paddlepaddle/book
+国内用户可以使用下面的镜像源来加速访问：
+  .. code-block:: bash
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 然后在浏览器中输入以下网址：
  .. code-block:: text

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
     docker run -p 8888:8888 paddlepaddle/book
+For users in China, we provide a faster mirror:
+  .. code-block:: bash
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 Then, you would back and paste the address into the local browser:
  .. code-block:: text

--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -92,11 +92,11 @@ paddle.init(
 参数说明
 - use_gpu： **可选，默认False**，是否启用GPU训练
- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- trainer_count：**必选，默认1**，当前trainer的线程数目
 - port：**必选，默认7164**，连接到pserver的端口
 - ports_num：**必选，默认1**，连接到pserver的端口个数
 - ports_num_for_sparse：**必选，默认0**，和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- num_gradient_servers：**必选，默认1**，当前训练任务trainer总数
 - trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开

--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -95,11 +95,11 @@ paddle.init(
 Parameter Description
 - use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
 - port: **required, default 7164**, port to connect to parameter server.
 - ports_num: **required, default 1**, number of ports for communication.
 - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,4 +8,3 @@ PaddlePaddle 文档
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
-  mobile/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
-  mobile/index_en.rst
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
-MOBILE
-======
-..  toctree::
-  :maxdepth: 1
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
-MOBILE
-======
-..  toctree::
-  :maxdepth: 1
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,7 +22,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 cc_test(variable_test SRCS variable_test.cc)

--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -23,12 +23,10 @@ namespace framework {
 template <typename T>
 class Channel {
 public:
-  virtual void Send(T*) = 0;
+  virtual bool Send(T*) = 0;
-  virtual void Receive(T*) = 0;
+  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
+  virtual void Close() = 0;
-  // Don't delete channels; instead, call Channel::Close.
- protected:
  virtual ~Channel() {}
 };
@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
 template <typename T>
 void CloseChannel(Channel<T>* ch) {
-  if (ch->Cap() > 0) {
+  ch->Close();
-    delete dynamic_cast<details::Buffered<T>*>(ch);
-  } else {
-    delete dynamic_cast<details::UnBuffered<T>*>(ch);
-  }
 }
 }  // namespace framework

--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -14,13 +14,329 @@ limitations under the License. */
 #include "paddle/framework/channel.h"
+#include <chrono>
+#include <thread>
 #include "gtest/gtest.h"
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
 TEST(Channel, MakeAndClose) {
-  using paddle::framework::Channel;
+  using paddle::framework::details::Buffered;
-  using paddle::framework::MakeChannel;
+  using paddle::framework::details::UnBuffered;
-  using paddle::framework::CloseChannel;
+  {
+    // MakeChannel should return a buffered channel if buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel if buffer_size = 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
+  }
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to send twice as many items as the buffer holds; nothing receives.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // the first 10 sends fit in the buffer
+      else
+        EXPECT_EQ(ch->Send(&i), false);  // buffer full: fails once closed
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum, 45U);  // 0 + 1 + ... + 9: only the first 10 sends are done
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+TEST(Channel, SimpleUnbufferedChannelTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(ch->Send(&i), true);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    EXPECT_EQ(ch->Receive(&recv), true);
+    EXPECT_EQ(recv, i);
+  }
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  // Verify that at least 4 threads are blocked
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (thread_ended[i] == false) ct++;
+  }
+  // At least 4 threads must be blocked
+  EXPECT_GE(ct, 4);
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  // Verify that only 1 send was successful
+  ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+  // Only 1 send must be successful
+  EXPECT_EQ(ct, 1);
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing an unbuffered channel also unblocks
+//  any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+// This tests that closing an unbuffered channel also unblocks
+//  any senders waiting for receivers
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data = 10;
+          EXPECT_EQ(ch->Send(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // The fourth Send should block,
+  // since only three Receive calls are made.
+  std::thread t([&]() {
+    // Try to send more times
+    // than there are receives
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum_send, 3U);  // 0 + 1 + 2: the fourth Send is still blocked
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since there are only 5 senders.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
-  Channel<int>* ch = MakeChannel<int>(10);
  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
 }
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <mutex>
 #include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
 namespace paddle {
 namespace framework {
@@ -29,9 +30,11 @@ class Buffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 public:
-  virtual void Send(T*);
+  virtual bool Send(T*);
-  virtual void Receive(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return cap_; }
+  virtual void Close();
+  virtual ~Buffered();
 private:
  size_t cap_;
@@ -39,42 +42,64 @@ class Buffered : public paddle::framework::Channel<T> {
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
  std::deque<T> channel_;
+  bool closed_;
-  Buffered(size_t cap) : cap_(cap) {}
+  Buffered(size_t cap) : cap_(cap), closed_(false) {
-  virtual ~Buffered();
+    PADDLE_ENFORCE_GT(cap, 0);
+  }
-  void NotifyAllSenders(std::unique_lock<std::mutex>*);
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
-  full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
+  full_cond_var_.wait(lock,
+                      [this]() { return channel_.size() < cap_ || closed_; });
+  bool ret = false;
+  if (!closed_) {
    channel_.push_back(std::move(*item));
    lock.unlock();
    empty_cond_var_.notify_one();
+    ret = true;
+  }
+  return ret;
 }
 template <typename T>
-void Buffered<T>::Receive(T* item) {
+bool Buffered<T>::Receive(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
-  empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
+  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  bool ret = false;
+  if (!closed_) {
    *item = std::move(channel_.front());
    channel_.pop_front();
-  NotifyAllSenders(&lock);
+    full_cond_var_.notify_one();
+    ret = true;
+  }
+  return ret;
+}
+template <typename T>
+void Buffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
+  NotifyAllParticipants(&lock);
 }
 template <typename T>
 Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
  channel_.clear();
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }
 template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
-  full_cond_var_.notify_one();
+  full_cond_var_.notify_all();
+  empty_cond_var_.notify_all();
 }
 }  // namespace details

--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <atomic>
 #include <condition_variable>
-#include <deque>
 #include <mutex>
 #include "paddle/framework/channel.h"
@@ -29,23 +29,117 @@ class UnBuffered : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 public:
-  virtual void Send(T*);
+  virtual bool Send(T*);
-  virtual void Receive(T*);
+  virtual bool Receive(T*);
  virtual size_t Cap() { return 0; }
+  virtual void Close();
+  virtual ~UnBuffered();
 private:
-  UnBuffered() {}
+  std::mutex mu_ch_;
-  virtual ~UnBuffered();
+  // Mutex for readers and writers who are waiting for other reader
+  // and writer to complete execution
+  std::recursive_mutex mu_read_, mu_write_;
+  // reader_found_ is set true when a reader is ready to accept data
+  // writer_found_ is set true when a writer is ready to send data
+  // A transaction occurs only when both are true
+  std::atomic<bool> reader_found_{false}, writer_found_{false};
+  std::condition_variable cv_channel_;
+  std::condition_variable_any cv_reader_, cv_writer_;
+  T* item{nullptr};
+  std::atomic<bool> closed_{false};
+  UnBuffered() : closed_(false) {}
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template <typename T>
+bool UnBuffered<T>::Send(T* data) {
+  // Prevent other writers from entering
+  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+  writer_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+  // If writer comes first, it should wait till a reader arrives
+  cv_writer_.wait(cv_lock,
+                  [this]() { return reader_found_ == true || closed_; });
+  cv_reader_.notify_one();
+  bool ret = false;
+  if (!closed_) {
+    std::unique_lock<std::mutex> channel_lock(mu_ch_);
+    item = data;
+    channel_lock.unlock();
+    cv_channel_.notify_one();
+    channel_lock.lock();
+    cv_channel_.wait(channel_lock,
+                     [this]() { return item == nullptr || closed_; });
+    ret = true;
+  }
+  writer_found_ = false;
+  return ret;
+}
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
+template <typename T>
+bool UnBuffered<T>::Receive(T* data) {
+  // Prevent other readers from entering
+  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+  reader_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+  // If reader comes first, it should wait till a writer arrives
+  cv_reader_.wait(cv_lock,
+                  [this]() { return writer_found_ == true || closed_; });
+  cv_writer_.notify_one();
+  bool ret = false;
+  if (!closed_) {
+    std::unique_lock<std::mutex> lock_ch{mu_ch_};
+    // Reader should wait for the writer to first write its data
+    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+    if (!closed_) {
+      *data = std::move(*item);
+      item = nullptr;
+      lock_ch.unlock();
+      ret = true;
+    }
+    cv_channel_.notify_one();
+  }
+  reader_found_ = false;
+  return ret;
+}
+// This function implements the sequence of events
+// that take place once the channel is closed.
 template <typename T>
-void UnBuffered<T>::Send(T* channel_element) {}
+void UnBuffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
 template <typename T>
-void UnBuffered<T>::Receive(T*) {}
+UnBuffered<T>::~UnBuffered() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
+// This function notifies all the readers, writers and
+// the channel condition variables.
 template <typename T>
-UnBuffered<T>::~UnBuffered() {}
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  cv_writer_.notify_all();
+  cv_channel_.notify_all();
+  cv_reader_.notify_all();
+}
 }  // namespace details
 }  // namespace framework

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
            "Checking whether operator produce NAN/INF or not. It will be "
            "extremely slow so please use this flag wisely.");
@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
 Executor::Executor(const platform::Place& place) : place_(place) {}
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    op->Run(*local_scope, place_);
    VLOG(3) << op->DebugStringEx(local_scope);
-    if (FLAGS_do_memory_benchmark) {
+    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);
    }
@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  if (create_vars && create_local_scope) {
    scope->DeleteScope(local_scope);
  }
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
    VLOG(2) << "-------------------------------------------------------";
    VLOG(2) << "Memory used after deleting local scope: "
            << memory::memory_usage(place_);

--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
@@ -20,5 +21,8 @@ namespace paddle {
 namespace framework {
 using FeedFetchType = LoDTensor;
 using FeedFetchList = std::vector<FeedFetchType>;
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string.h>  // for strdup
 #include <algorithm>
+#include <stdexcept>
 #include <string>
 #include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
  std::vector<platform::Place> places;
  places.emplace_back(platform::CPUPlace());
+  int count = 0;
 #ifdef PADDLE_WITH_CUDA
-  int count = platform::GetCUDADeviceCount();
+  try {
-  for (int i = 0; i < count; ++i) {
+    count = platform::GetCUDADeviceCount();
-    places.emplace_back(platform::CUDAPlace(i));
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
  }
 #else
  LOG(WARNING)
-      << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+      << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
 #endif
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(platform::CUDAPlace(i));
+  }
  platform::DeviceContextPool::Init(places);
 }

--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
  using paddle::framework::InitDevices;
  using paddle::platform::DeviceContextPool;
+#ifndef PADDLE_WITH_CUDA
  InitDevices();
  DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_GE(pool.size(), 1U);
+  ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+TEST(InitDevices, CUDA) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+#ifdef PADDLE_WITH_CUDA
+  int count = paddle::platform::GetCUDADeviceCount();
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
 }
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
-#include <glog/logging.h>
 namespace paddle {
 namespace framework {

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 #endif
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-#ifndef PADDLE_WITH_CUDA
-template <typename T>
-using Vector = std::vector<T>;
-#else
-template <typename T>
-using Vector = thrust::host_vector<
-    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
-#endif
 /*
 * LoD is short for Level of Details.
 *
@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
 *    0 2 4 7
 *    0 2 5 7 10 12 15 20
 */
-using LoD = std::vector<Vector<size_t>>;
+struct LoD : public std::vector<Vector<size_t>> {
+  using std::vector<Vector<size_t>>::vector;
+  void CopyFromCUDA() {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyFromCUDA();
+    }
+  }
+};
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
 */
 class LoDTensor : public Tensor {
 public:
-  LoDTensor() {}
+  LoDTensor() : Tensor() {}
+  /* Constructor with place should only be used in pybind */
+  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
  explicit LoDTensor(const LoD& lod) : lod_(lod) {}

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
 namespace paddle {
 namespace framework {
+// Builds a three-level LoD through brace initialization and push_back, then
+// checks that element i of level 0 still equals i.
+TEST(LoD, data) {
+  LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  auto& first_level = lod[0];
+  size_t idx = 0;
+  while (idx < first_level.size()) {
+    EXPECT_EQ(first_level[idx], idx);
+    ++idx;
+  }
+}
 TEST(LodExpand, test) {
  LoD lod{{0, 2}};
  LoDTensor tensor;

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/platform/assert.h"
@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
  }
 }
+// Checks that Vector exposes its host storage through data() and that the
+// contents survive a host -> device -> host round trip.
+TEST(Vector, Normal) {
+  paddle::framework::InitDevices();
+  paddle::framework::Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+  // Upload before clearing: CopyFromCUDA() returns early while no device
+  // buffer exists, which would leave the cleared vector empty and make the
+  // element reads below out of bounds.
+  vec.CopyToCUDA();
+  vec.clear();
+  vec.CopyFromCUDA();
+  std::vector<size_t> v = {1, 2, 3};
+  // Stop here if the round trip did not restore the size, instead of
+  // indexing past the end of vec.
+  ASSERT_EQ(v.size(), vec.size());
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+// Runs the `test` kernel on the device buffer of LoD level 0, copies the
+// buffer back to the host and expects element i to equal 2 * i.
+TEST(LoD, data) {
+  paddle::framework::InitDevices();
+  paddle::framework::LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  auto& level0 = lod[0];
+  size_t* dev_ptr = level0.cuda_data();
+  test<<<1, 1>>>(dev_ptr, level0.size());
+  cudaDeviceSynchronize();
+  level0.CopyFromCUDA();
+  size_t idx = 0;
+  while (idx < level0.size()) {
+    EXPECT_EQ(level0[idx], idx * 2);
+    ++idx;
+  }
+}
 TEST(LoDTensor, LoDInGPU) {
+  paddle::framework::InitDevices();
  paddle::framework::LoDTensor lod_tensor;
  paddle::platform::CUDAPlace place(0);
@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
  auto lod = lod_tensor.lod();
-  test<<<1, 8>>>(lod[0].data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
  cudaDeviceSynchronize();
+  lod.CopyFromCUDA();
  for (size_t i = 0; i < src_lod[0].size(); ++i) {
    EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);

--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <initializer_list>
+#include <vector>
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+namespace paddle {
+namespace framework {
+/**
+ * @brief A std::vector<T> that can additionally mirror its contents in CUDA
+ * device memory.
+ *
+ * The host elements live in the std::vector base and share the lifetime of
+ * the Vector object. The device buffer is allocated lazily by CopyToCUDA()
+ * (which cuda_data() calls) and is released in the destructor. Host and
+ * device copies are only synchronized by the explicit Copy* calls.
+ *
+ * Copying or assigning a Vector transfers the host elements only. Each Vector
+ * owns its own device buffer, so two Vectors never free the same cuda_ptr_.
+ */
+template <typename T>
+class Vector : public std::vector<T> {
+ public:
+  using std::vector<T>::vector;
+  Vector() {}
+  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
+  /* Copies the host elements only; the new object starts without a device
+   * buffer. A member-wise copy would duplicate cuda_ptr_ and both objects
+   * would free it in their destructors. */
+  Vector(const Vector<T> &other) : std::vector<T>(other) {}
+  /* Replaces the host elements only; this object keeps its own device buffer,
+   * which is not refreshed until the next CopyToCUDA()/cuda_data() call. */
+  Vector<T> &operator=(const Vector<T> &other) {
+    std::vector<T>::operator=(other);
+    return *this;
+  }
+  /* Releases the device buffer, if one was allocated. */
+  virtual ~Vector() {
+#ifdef PADDLE_WITH_CUDA
+    if (cuda_ptr_ != nullptr) {
+      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    }
+#endif
+  }
+  /* Get device vector: calls CopyToCUDA() first, then returns the device
+   * pointer. Enforces that the pointer is not null. */
+  T *cuda_data() {
+    CopyToCUDA();
+    PADDLE_ENFORCE_NOT_NULL(
+        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+    return static_cast<T *>(cuda_ptr_);
+  }
+  /* Get host vector */
+  T *data() { return std::vector<T>::data(); }
+  const T *data() const { return std::vector<T>::data(); }
+  /* Synchronize host vector to device vector */
+  void CopyToCUDA();
+  /* Synchronize device vector to host vector */
+  void CopyFromCUDA();
+  /* Switch device vector location */
+  void CopyToPeer(platform::Place);
+ private:
+  void *cuda_ptr_ = nullptr;  // device buffer; nullptr until first upload
+  size_t cuda_size_ = 0;  // device vector numel
+  platform::CUDAPlace place_;  // device that owns cuda_ptr_
+};
+/* Uploads the host elements to the device buffer on place_. The buffer is
+ * (re)allocated when the recorded device numel is smaller than the host
+ * size; the call blocks until the copy on the device context has finished. */
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  const size_t host_numel = this->size();
+  const size_t host_bytes = host_numel * sizeof(T);
+  const bool need_realloc = cuda_size_ < host_numel;
+  if (need_realloc && cuda_ptr_ != nullptr) {
+    memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+  }
+  if (need_realloc) {
+    cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(place_, host_bytes);
+  }
+  cuda_size_ = host_numel;
+  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+               static_cast<const void *>(this->data()), host_bytes,
+               ctx->stream());
+  ctx->Wait();
+#endif
+}
+/* Downloads the device buffer into the host vector. The host vector is
+ * resized to the recorded device numel first. When no device buffer exists a
+ * warning is logged and the host vector is left untouched. */
+template <typename T>
+void Vector<T>::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_ptr_ == nullptr) {
+    LOG(WARNING) << "No uncommitted cuda data.";
+    return;
+  }
+  this->resize(cuda_size_);
+  const size_t bytes = this->size() * sizeof(T);
+  void *host_dst = static_cast<void *>(this->data());
+  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  memory::Copy(platform::CPUPlace(), host_dst, place_,
+               static_cast<const void *>(cuda_ptr_), bytes, ctx->stream());
+  ctx->Wait();
+#endif
+}
+/* Moves the device buffer to the CUDA device given by peer_place.
+ *
+ * peer_place must hold a platform::CUDAPlace (it is extracted with
+ * boost::get). If nothing has been uploaded yet, only the target device is
+ * recorded and the next upload allocates there. Otherwise the cuda_size_
+ * elements currently on the device are copied to a new buffer on the peer
+ * and the old buffer is freed. The size is taken from cuda_size_ rather than
+ * the host size() because the host vector may have been resized since the
+ * last upload. */
+template <typename T>
+void Vector<T>::CopyToPeer(platform::Place peer_place) {
+#ifdef PADDLE_WITH_CUDA
+  const platform::CUDAPlace peer = boost::get<platform::CUDAPlace>(peer_place);
+  if (cuda_ptr_ == nullptr) {
+    // No device data to move; avoid copying from / freeing a null pointer.
+    place_ = peer;
+    return;
+  }
+  const size_t bytes = cuda_size_ * sizeof(T);
+  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(peer, bytes);
+  memory::Copy(peer, peer_cuda_ptr, place_, cuda_ptr_, bytes, ctx->stream());
+  // Wait for the copy to finish before releasing the source buffer.
+  ctx->Wait();
+  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+  place_ = peer;
+  cuda_ptr_ = peer_cuda_ptr;
+#endif
+}
+template class Vector<int>;
+template class Vector<unsigned>;
+template class Vector<size_t>;
+template class Vector<int64_t>;
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
  bool HasOutputs(const std::string &name) const override;
-  DDim GetInputDim(const std::string &name) const override;
-  void SetOutputDim(const std::string &name, const DDim &dim) override;
  AttrReader Attrs() const override;
  const std::vector<std::string> &Inputs(
@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
  return true;
 }
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
-  std::vector<DDim> ddims = GetInputsDim(name);
-  auto length = ddims.size();
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have 1 value, "
-                    "but it has %d now",
-                    name, length);
-  return ddims[0];
-}
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
-                                                const DDim &dim) {
-  SetOutputsDim(name, {dim});
-}
 AttrReader CompileTimeInferShapeContext::Attrs() const {
  return AttrReader(op_.GetAttrMap());
 }

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
-DEFINE_bool(op_sync, false,
+DECLARE_bool(benchmark);
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
 namespace paddle {
 namespace framework {
@@ -368,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return true;
  }
-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
  const std::vector<std::string>& Inputs(
@@ -531,7 +521,7 @@ void OperatorWithKernel::Run(const Scope& scope,
      ExecutionContext(*this, new_scope, *new_dev_ctx));
  /*For profiling/benchmark only*/
-  if (FLAGS_op_sync) {
+  if (FLAGS_benchmark) {
    new_dev_ctx->Wait();
  }
 }

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,13 +14,11 @@ limitations under the License. */
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 namespace paddle {
 namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
 BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
  auto *b = desc_.add_blocks();
  b->set_parent_idx(parent.ID());

--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <vector>
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/proto_desc.h"
 #include "paddle/platform/macros.h"

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs");
+            "and add some memory usage logs. "
+            "Default cuda is asynchronous device, set to True will "
+            "force op run in synchronous mode.");
 namespace paddle {
 namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
  // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
    delete scope;
  } else {
    Async([scope] { delete scope; });

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+// Returns the dim of the single argument bound to input slot `name`.
+// Enforces that the slot holds exactly one argument.
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+  const auto &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Input(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  return GetDim(arg_names.front());
+}
+std::vector<DDim> InferShapeContext::GetInputsDim(
    const std::string &name) const {
-  const std::vector<std::string> &names = Inputs(name);
+  const std::vector<std::string> &arg_names = Inputs(name);
-  return GetDims(names);
+  return GetDims(arg_names);
 }
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
  return this->GetDim(names[idx]);
 }
-void InferShapeContext::SetOutputsDim(
+// Sets the dim of the single argument bound to output slot `name`.
+// Enforces that the slot holds exactly one argument.
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
-    const std::string &name, const std::vector<framework::DDim> &dims) {
+  const auto &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Output(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  SetDim(arg_names.front(), dim);
+}
+void InferShapeContext::SetOutputsDim(const std::string &name,
+                                      const std::vector<DDim> &dims) {
  auto &names = Outputs(name);
  SetDims(names, dims);
 }
-std::vector<framework::DDim> InferShapeContext::GetDims(
+std::vector<DDim> InferShapeContext::GetDims(
    const std::vector<std::string> &names) const {
-  std::vector<framework::DDim> ret;
+  std::vector<DDim> ret;
  ret.reserve(names.size());
  std::transform(
      names.begin(), names.end(), std::back_inserter(ret),
      [this](const std::string &name) { return this->GetDim(name); });
  return ret;
 }
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<framework::DDim> &dims) {
+                                const std::vector<DDim> &dims) {
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t i = 0; i < length; ++i) {

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,13 @@ class InferShapeContext {
  virtual bool HasInputs(const std::string &name) const = 0;
  virtual bool HasOutputs(const std::string &name) const = 0;
-  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+  DDim GetInputDim(const std::string &name) const;
-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetInputsDim(const std::string &name) const;
  DDim GetInputsElementDim(const std::string &name, int idx) const;
-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  void SetOutputDim(const std::string &name, const DDim &dim);
-  void SetOutputsDim(const std::string &name,
+  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
-                     const std::vector<framework::DDim> &dims);
  virtual AttrReader Attrs() const = 0;
  virtual const std::vector<std::string> &Inputs(
@@ -57,15 +56,13 @@ class InferShapeContext {
  // Note: In while op, we need this to be public
  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+               const std::vector<DDim> &dims);
 protected:
-  virtual framework::DDim GetDim(const std::string &name) const = 0;
+  virtual DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+  virtual void SetDim(const std::string &name, const DDim &dim) = 0;
-  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const;
+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
  std::vector<proto::VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
 public:
  Tensor() : offset_(0) {}
+  /*! Constructor with place should only be used in pybind. */
+  explicit Tensor(const platform::Place& place) : offset_(0) {
+    // NOTE(review): nothing visible here initializes holder_ before this
+    // call; if holder_ is still empty at this point this dereferences a null
+    // pointer -- confirm against the holder_ declaration.
+    holder_->set_place(place);
+  }
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
    virtual std::type_index type() const = 0;
    virtual platform::Place place() const = 0;
    virtual void set_type(std::type_index type) = 0;
+    virtual void set_place(platform::Place place) = 0;
  };
  template <typename Place>
@@ -156,6 +162,7 @@ class Tensor {
    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual std::type_index type() const { return type_; }
    virtual void set_type(std::type_index type) { type_ = type; }
+    virtual void set_place(platform::Place place) { place_ = place; }
    /*! the pointer of memory block. */
    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
    bool needIm2col = isNeedIm2col(filter);
    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
    TensorShape colShape;
-    real* colData = NULL;
-    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    // Max col matrix width 4096, Max col matrix size 4M.
-    size_t colWidth = outputHeight * outputWidth;
+    size_t outputHeightSteps =
-    // Max col matrix height 256, Max col matrix width 1024
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
-    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t maxColWidth = outputHeightSteps * outputWidth;
-    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+    // Number of input channels unrolled per step. Each channel contributes
+    // filterHeight * filterWidth rows of maxColWidth columns, so divide the
+    // 1048576-element budget by the whole filter area (not by filterHeight
+    // and then multiply by filterWidth). Clamp to [1, channels per group].
+    size_t channelSteps = std::min(
+        std::max((1048576 / maxColWidth) / (filterHeight * filterWidth),
+                 (size_t)1),
+        inputChannels / groups_);
+    // Row count of the largest col matrix built in one step.
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
                              outputHeight,
                              outputWidth});
-      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
      colData = reinterpret_cast<real*>(memory_->getBuf());
    }
@@ -209,20 +212,24 @@ public:
        (outputChannels / groups_) * outputHeight * outputWidth;
    size_t filterOffset = filter.getElements() / groups_;
-    int nStride = colWidth;
+    int nStride = outputHeight * outputWidth;
-    int kStride = colHeight;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
    for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
      for (size_t g = 0; g < groups_; g++) {
        if (needIm2col) {
          real beta_ = beta;
-          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+          for (size_t ic = 0; ic < inputChannels / groups_;
-               colHeightStart += stepColHeight) {
+               ic += channelSteps) {
-            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
-                 colWidthStart += stepColWidth) {
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
-              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int height = std::min(outputHeight - oh, outputHeightSteps);
-              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
              // im2col
-              im2col(inputData + g * inputOffset,
+              im2col(inputData,
                     imShape,
                     colData,
                     colShape,
@@ -232,13 +239,12 @@ public:
                     paddingW(),
                     dilationH(),
                     dilationW(),
-                     colHeightStart,
+                     channels,
-                     K,
+                     oh,
-                     colWidthStart,
+                     height,
                     N);
              // gemm
-              int M = outputChannels / groups_;
              BlasGemm<Device, real>::compute(
                  false,
                  false,
@@ -246,12 +252,12 @@ public:
                  N,
                  K,
                  1.0f,
-                  filterData + g * filterOffset + colHeightStart,
+                  filterData + ic * filterHeight * filterWidth,
                  kStride,
                  colData,
                  N,
                  beta_,
-                  outputData + g * outputOffset + colWidthStart,
+                  outputData + oh * outputWidth,
                  nStride);
            }
            beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
                                          N,
                                          K,
                                          1.0f,
-                                          filterData + g * filterOffset,
+                                          filterData,
                                          K,
-                                          inputData + g * inputOffset,
+                                          inputData,
                                          N,
                                          beta,
-                                          outputData + g * outputOffset,
+                                          outputData,
                                          N);
        }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
    }
    memory_.reset();

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -111,39 +111,42 @@ public:
                  int paddingWidth,
                  int dilationHeight,
                  int dilationWidth,
-                  int colHeightStart,
+                  int inputChannels,
-                  int colHeightSize,
+                  int colOffset,
-                  int colWidthStart,
+                  int colOutputHeight,
-                  int colWidthSize) {
+                  int colWidth) {
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
    int filterHeight = colShape[1];
    int filterWidth = colShape[2];
    int outputWidth = colShape[4];
-    for (int colh = 0; colh < colHeightSize; colh++) {
+    for (int ic = 0; ic < inputChannels; ic++) {
-      int wOffset = (colHeightStart + colh) % filterWidth;
+      for (int oh = 0; oh < colOutputHeight; oh++) {
-      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
+        T* dstData = colData + oh * outputWidth;
-      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
-      for (int colw = 0; colw < colWidthSize; colw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight +
-        int h = (colWidthStart + colw) / outputWidth;
+                           fh * dilationHeight - paddingHeight;
-        int w = (colWidthStart + colw) % outputWidth;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
-        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-        int imColIdx = w * strideWidth + wOffset * dilationWidth;
-        if ((imRowIdx - paddingHeight) < 0 ||
-            (imRowIdx - paddingHeight) >= inputHeight ||
-            (imColIdx - paddingWidth) < 0 ||
-            (imColIdx - paddingWidth) >= inputWidth) {
-          colData[colh * colWidthSize + colw] = static_cast<T>(0);
            } else {
-          imRowIdx += c_im * inputHeight - paddingHeight;
+              for (int ow = 0; ow < outputWidth; ow++) {
-          imColIdx -= paddingWidth;
+                int imColIdx =
-          colData[colh * colWidthSize + colw] =
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
-              imData[imRowIdx * inputWidth + imColIdx];
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
+                  dstData[ow] = T(0);
+                } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+                }
+              }
+            }
+            dstData += colWidth;
+          }
        }
      }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
    }
  }
 };

--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
                          padding,
                          dilation,
                          dilation,
+                          channels,
                          0,
-                          height,
+                          outputHeight,
-                          0,
+                          outputHeight * outputWidth);
-                          width);
                  autotest::TensorCheckEqual(*output1, *output2);
                }

--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
 cc_library(paddle_fluid_api
    SRCS io.cc
@@ -29,19 +29,6 @@ add_custom_target(inference_lib_dist DEPENDS
  inference_lib framework_lib memory_lib platform_lib string_lib
  gflags_lib glog_lib protobuf_lib eigen3_lib)
-add_executable(example example.cc)
+if(WITH_TESTING)
-if(APPLE)
+  add_subdirectory(tests/book)
-  set(OPTIONAL_LINK_FLAGS)
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-    set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
-  endif()
-  target_link_libraries(example
-      -Wl,-force_load paddle_fluid
-      ${OPTIONAL_LINK_FLAGS}
-      ${PTOOLS_LIB})
-else()
-  target_link_libraries(example
-      -Wl,--start-group -Wl,--whole-archive paddle_fluid
-      -Wl,--no-whole-archive -Wl,--end-group
-      ${PTOOLS_LIB})
 endif()
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/inference/io.h"
 #include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 namespace paddle {
 namespace inference {
-const std::string kFeedOpType = "feed";
 bool IsParameter(const framework::VarDesc* var,
                 const framework::ProgramDesc& main_program) {
  if (var->Persistable()) {
@@ -27,7 +28,7 @@ bool IsParameter(const framework::VarDesc* var,
    for (size_t i = 0; i < main_program.Size(); ++i) {
      const framework::BlockDesc& block = main_program.Block(i);
      for (auto* op : block.AllOps()) {
-        if (op->Type() == kFeedOpType) {
+        if (op->Type() == framework::kFeedOpType) {
          continue;
        }
        for (auto input_argument_name : op->InputArgumentNames()) {
@@ -51,7 +52,7 @@ void LoadPersistables(framework::Executor& executor,
  framework::BlockDesc* load_block = load_program->MutableBlock(0);
  for (auto* var : global_block.AllVars()) {
    if (IsParameter(var, main_program)) {
-      LOG(INFO) << "parameter's name: " << var->Name();
+      VLOG(3) << "parameter's name: " << var->Name();
      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->Shape());

--- a/paddle/inference/io.h
+++ b/paddle/inference/io.h
@@ -17,18 +17,13 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/framework/block_desc.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
-#include "paddle/framework/var_desc.h"
 namespace paddle {
 namespace inference {
-bool IsParameter(const framework::VarDesc* var,
-                 const framework::ProgramDesc& main_program);
 void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
                      const std::string& dirname,

--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
+# Root of the Python fluid tests; the model directory passed to the test
+# below is located under its book/ subdirectory.
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+# C++ inference test for the recognize_digits MLP model. The model directory
+# is handed to the test binary through its --dirname flag.
+cc_test(test_inference_recognize_digits_mlp
+    SRCS test_inference_recognize_digits.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+# Run only after test_recognize_digits.
+# NOTE(review): presumably that test writes the model directory -- confirm.
+set_tests_properties(test_inference_recognize_digits_mlp
+    PROPERTIES DEPENDS test_recognize_digits)
--- a/paddle/inference/example.cc
+++ b/paddle/inference/example.cc
@@ -12,93 +12,102 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gtest/gtest.h>
 #include <time.h>
-#include <iostream>
+#include <sstream>
 #include "gflags/gflags.h"
-#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/inference/io.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
-int main(int argc, char** argv) {
+template <typename Place, typename T>
-  google::ParseCommandLineFlags(&argc, &argv, true);
+void TestInference(const std::string& dirname,
-  if (FLAGS_dirname.empty()) {
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-    // Example:
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
-    //   ./example --dirname=recognize_digits_mlp.inference.model
+  // 1. Define place, executor and scope
-    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
+  auto place = Place();
-    exit(1);
+  auto executor = paddle::framework::Executor(place);
-  }
-  // 1. Define place, executor, scope
-  auto place = paddle::platform::CPUPlace();
-  paddle::framework::InitDevices();
-  auto* executor = new paddle::framework::Executor(place);
  auto* scope = new paddle::framework::Scope();
-  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  // 2. Initialize the inference_program and load all parameters from file
-  std::string dirname = FLAGS_dirname;
+  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
-  // 2. Initialize the inference program
-  auto inference_program = paddle::inference::Load(*executor, *scope, dirname);
-  // 3. Optional: perform optimization on the inference_program
+  // 3. Get the feed_target_names and fetch_target_names
-  // 4. Get the feed_target_names and fetch_target_names
  const std::vector<std::string>& feed_target_names =
      inference_program->GetFeedTargetNames();
  const std::vector<std::string>& fetch_target_names =
      inference_program->GetFetchTargetNames();
-  // 5. Generate input
+  // 4. Prepare inputs: set up maps for feed targets
-  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
-  std::vector<paddle::framework::LoDTensor> feeds;
-  feeds.push_back(input);
-  std::vector<paddle::framework::LoDTensor> fetchs;
-  // Set up maps for feed and fetch targets
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  // set_feed_variable
  for (size_t i = 0; i < feed_target_names.size(); ++i) {
-    feed_targets[feed_target_names[i]] = &feeds[i];
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
  }
-  // get_fetch_variable
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  fetchs.resize(fetch_target_names.size());
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
-    fetch_targets[fetch_target_names[i]] = &fetchs[i];
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
  }
-  // Run the inference program
+  // 6. Run the inference program
-  executor->Run(*inference_program, scope, feed_targets, fetch_targets);
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
-  // Get outputs
+  delete scope;
-  for (size_t i = 0; i < fetchs.size(); ++i) {
+}
-    auto dims_i = fetchs[i].dims();
-    std::cout << "dims_i:";
+TEST(inference, recognize_digits) {
-    for (int j = 0; j < dims_i.size(); ++j) {
+  if (FLAGS_dirname.empty()) {
-      std::cout << " " << dims_i[j];
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-    }
-    std::cout << std::endl;
-    std::cout << "result:";
-    float* output_ptr = fetchs[i].data<float>();
-    for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
-      std::cout << " " << output_ptr[j];
-    }
-    std::cout << std::endl;
  }
-  delete scope;
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  delete executor;
+  std::string dirname = FLAGS_dirname;
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-  return 0;
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+  float err = 1E-3;
+  int count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
 }
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    CHECK_EQ(channels * outLength, maskMatP->getWidth());
  }
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[i * outStride + j] = -(real)FLT_MAX;
-    }
-  }
  /* pool max one by one */
  for (size_t n = 0; n < num; ++n) {  // frame by frame
    if (!isContiguous()) {
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    for (size_t c = 0; c < channels; ++c) {  // channel by channel
      for (size_t ph = 0; ph < outputH; ++ph) {
        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
+        int hend = hstart + sizeY;
-        hstart = std::max(hstart, 0);
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
        for (size_t pw = 0; pw < outputW; ++pw) {
          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
+          int wend = wstart + sizeX;
-          wstart = std::max(wstart, 0);
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
          if (maskData == NULL) {
+            real tmp = -(real)FLT_MAX;
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
-                outData[ph * outputW + pw] = std::max(
+                tmp = tmp < inputData[h * imgSizeW + w]
-                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+                          ? inputData[h * imgSizeW + w]
+                          : tmp;
              }
            }
+            outData[ph * outputW + pw] = tmp;
          } else {
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
 endif()
 op_library(cond_op DEPS framework_proto tensor net_op)
@@ -156,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+    vol2col depthwise_conv)
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
@@ -173,6 +178,8 @@ endif()
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
@@ -192,3 +199,4 @@ if(WITH_GPU)
    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
    auto grad_merge = merge_func(context, grad);
    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    auto& merge_rows = grad_merge.rows();
+    framework::Vector<int64_t> merge_rows(grad_merge.rows());
    // 2. m += g_m * g_m
    math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
    auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
    SparseAdagradFunctorKernel<
        T, 256><<<grid2, threads, 0,
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
+                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
-                                   lr, param_data, moment_data, grad_width,
+                                   param_data, moment_data, grad_width,
                                   epsilon);
  }
 };

--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
          merge_func(ctx.template device_context<DeviceContext>(), grad);
      auto& grad_tensor = grad_merge.value();
      const T* grad_data = grad_tensor.template data<T>();
-      auto* rows = grad_merge.rows().data();
+      int64_t* rows = nullptr;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = grad_merge.mutable_rows()->cuda_data();
+      } else {
+        rows = grad_merge.mutable_rows()->data();
+      }
      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
      SparseAdamFunctor<T> functor(

--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
                   "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
    auto dims = ctx->GetInputDim("DistMat");
    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
    ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDis", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
  }
 };
@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* dist_mat = context.Input<LoDTensor>("DistMat");
    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
              "Otherwise, it means B[j] is matched to row "
              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
              "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDis",
+    AddOutput("ColToRowMatchDist",
              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
              "instance are called LoD. Then "
-              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
    AddComment(R"DOC(
 This operator is a greedy bipartite matching algorithm, which is used to
 obtain the matching with the maximum distance based on the input

--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/box_coder_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for box_coder. InferShape verifies that all required
+// inputs/outputs exist, validates the box tensor shapes, and sets OutputBox
+// to shape [N, M, 4], where N is the number of target boxes and M the number
+// of prior boxes.
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+    // PriorBox holds M boxes of 4 coordinates each.
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBox must be 2");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [M, 4]");
+    // PriorBoxVar carries one variance per prior box coordinate, so its shape
+    // has to match PriorBox exactly.
+    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims,
+                      "The shape of PriorBoxVar must be the same as PriorBox");
+    // TargetBox holds N boxes of 4 values each.
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                      "The rank of Input of TargetBox must be 2");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                      "The shape of TargetBox is [N, 4]");
+    // Called only for validation: GetBoxCodeType throws on an unsupported
+    // code_type, the returned value is not needed here.
+    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+  }
+};
+// Declares the inputs, output, attribute and user-facing documentation of
+// the box_coder operator.
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>) "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+             "of variance.");
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the box if the input "
+        "is image feature map, they are close to the origin of the coordinate "
+        "system. [xmax, ymax] is the right bottom coordinate of the box. "
+        "This tensor can contain LoD information to represent a batch "
+        "of inputs. One instance of this batch can contain different "
+        "numbers of entities.");
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddOutput(
+        "OutputBox",
+        "(LoDTensor or Tensor) "
+        "The output of box_coder_op, a tensor with shape [N, M, 4] "
+        "representing the result of N target boxes encoded/decoded with "
+        "M Prior boxes and variances.");
+    // The formulas below mirror the arithmetic of the CPU and CUDA kernels.
+    AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv
+oh = log(abs(th / ph)) / phv
+The Decoding schema described below:
+cx = pxv * tx * pw + px
+cy = pyv * ty * ph + py
+w = exp(pwv * tw) * pw
+h = exp(phv * th) * ph
+oxmin = cx - w / 2
+oymin = cy - h / 2
+oxmax = cx + w / 2
+oymax = cy + h / 2
+For encoding, tx, ty, tw, th denote the target box's center coordinates, width
+and height respectively, and ox, oy, ow, oh denote the encoded center
+coordinates, width and height. For decoding, tx, ty, tw, th denote the four
+encoded values stored in the target box, and [oxmin, oymin, oxmax, oymax] is
+the decoded box. In both cases px, py, pw, ph denote the priorbox's (anchor)
+center coordinates, width and height, and pxv, pyv, pwv, phv denote the
+variance of the priorbox.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+// CUDA kernel that encodes every (target box, prior box) pair in center-size
+// form. Each thread handles the pair with flat index `idx`: the target box is
+// row idx / col, the prior box is row idx % col, and the four results are
+// written to output[idx * len .. idx * len + 3].
+//   row - number of target boxes
+//   col - number of prior boxes
+//   len - number of values stored per box
+template <typename T>
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads beyond the last pair have no work.
+  if (idx >= row * col) return;
+  // Per-thread views of the boxes involved; a box is read as
+  // [0]=xmin, [1]=ymin, [2]=xmax, [3]=ymax.
+  const T* prior = prior_box_data + (idx % col) * len;
+  const T* prior_var = prior_box_var_data + (idx % col) * len;
+  const T* target = target_box_data + (idx / col) * len;
+  T* out = output + idx * len;
+  // Size and center of the prior box.
+  const T prior_w = prior[2] - prior[0];
+  const T prior_h = prior[3] - prior[1];
+  const T prior_cx = (prior[2] + prior[0]) / 2;
+  const T prior_cy = (prior[3] + prior[1]) / 2;
+  // Center and size of the target box.
+  const T target_cx = (target[2] + target[0]) / 2;
+  const T target_cy = (target[3] + target[1]) / 2;
+  const T target_w = target[2] - target[0];
+  const T target_h = target[3] - target[1];
+  // Center offsets are normalized by the prior size, sizes by a log ratio;
+  // everything is then scaled by the matching prior variance.
+  out[0] = (target_cx - prior_cx) / prior_w / prior_var[0];
+  out[1] = (target_cy - prior_cy) / prior_h / prior_var[1];
+  out[2] = log(fabs(target_w / prior_w)) / prior_var[2];
+  out[3] = log(fabs(target_h / prior_h)) / prior_var[3];
+}
+// CUDA kernel that decodes every (target box, prior box) pair from
+// center-size form back to corner form. Each thread handles the pair with
+// flat index `idx`: the encoded target is row idx / col, the prior box is row
+// idx % col, and the decoded [xmin, ymin, xmax, ymax] is written to
+// output[idx * len .. idx * len + 3].
+//   row - number of target boxes
+//   col - number of prior boxes
+//   len - number of values stored per box
+template <typename T>
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads beyond the last pair have no work.
+  if (idx >= row * col) return;
+  // Per-thread views of the rows involved.
+  const T* prior = prior_box_data + (idx % col) * len;
+  const T* prior_var = prior_box_var_data + (idx % col) * len;
+  const T* target = target_box_data + (idx / col) * len;
+  T* out = output + idx * len;
+  // Size and center of the prior box ([0]=xmin, [1]=ymin, [2]=xmax, [3]=ymax).
+  const T prior_w = prior[2] - prior[0];
+  const T prior_h = prior[3] - prior[1];
+  const T prior_cx = (prior[2] + prior[0]) / 2;
+  const T prior_cy = (prior[3] + prior[1]) / 2;
+  // Undo the encoding: sizes via exp, centers via scale-and-shift.
+  const T box_w = exp(prior_var[2] * target[2]) * prior_w;
+  const T box_h = exp(prior_var[3] * target[3]) * prior_h;
+  const T box_cx = prior_var[0] * target[0] * prior_w + prior_cx;
+  const T box_cy = prior_var[1] * target[1] * prior_h + prior_cy;
+  // Convert center/size to corner coordinates.
+  out[0] = box_cx - box_w / 2;
+  out[1] = box_cy - box_h / 2;
+  out[2] = box_cx + box_w / 2;
+  out[3] = box_cy + box_h / 2;
+}
+// GPU kernel wrapper for box_coder. Allocates OutputBox with shape
+// {row, col, len} and launches one thread per (target box, prior box) pair
+// to encode or decode, depending on the `code_type` attribute.
+template <typename T>
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    if (target_box->lod().size()) {
+      // Compare against an unsigned literal, matching the CPU kernel, so the
+      // check does not mix signed and unsigned operands.
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    // row: number of target boxes; col: number of prior boxes;
+    // len: number of values per box.
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    // One thread per (target, prior) pair, rounded up to whole blocks.
+    int block = 512;
+    int grid = (row * col + block - 1) / block;
+    auto& device_ctx = context.cuda_device_context();
+    const T* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = prior_box_var->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    T* output = output_box->data<T>();
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
+                        ops::BoxCoderCUDAKernel<double>);
--- a/paddle/operators/box_coder_op.h
+++ b/paddle/operators/box_coder_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+// Coding modes supported by the box_coder operator.
+enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
+// Maps the `code_type` attribute string to its BoxCodeType value.
+// Throws (via PADDLE_THROW) when the string names no supported mode.
+inline BoxCodeType GetBoxCodeType(const std::string& type) {
+  if (type == "encode_center_size") {
+    return BoxCodeType::kEncodeCenterSize;
+  }
+  if (type == "decode_center_size") {
+    return BoxCodeType::kDecodeCenterSize;
+  }
+  PADDLE_THROW("Not support type %s.", type);
+}
+// CPU kernel for box_coder. Encodes or decodes every (target box, prior box)
+// pair in center-size form and stores the result in OutputBox, which is
+// allocated with shape {row, col, len} (row target boxes, col prior boxes,
+// len values per box).
+template <typename T>
+class BoxCoderKernel : public framework::OpKernel<T> {
+ public:
+  // Encodes each target box against each prior box. The four values for pair
+  // (i, j) are written starting at output[i * col * len + j * len].
+  void EncodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    const int64_t row = target_box.dims()[0];
+    const int64_t col = prior_box.dims()[0];
+    const int64_t len = prior_box.dims()[1];
+    const T* target_base = target_box.data<T>();
+    const T* prior_base = prior_box.data<T>();
+    const T* var_base = prior_box_var.data<T>();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* target = target_base + i * len;
+      for (int64_t j = 0; j < col; ++j) {
+        // Row views; a box is read as [0]=xmin, [1]=ymin, [2]=xmax, [3]=ymax.
+        const T* prior = prior_base + j * len;
+        const T* var = var_base + j * len;
+        const size_t offset = i * col * len + j * len;
+        T* out = output + offset;
+        // Size and center of the prior box.
+        const T prior_w = prior[2] - prior[0];
+        const T prior_h = prior[3] - prior[1];
+        const T prior_cx = (prior[2] + prior[0]) / 2;
+        const T prior_cy = (prior[3] + prior[1]) / 2;
+        // Center and size of the target box.
+        const T target_cx = (target[2] + target[0]) / 2;
+        const T target_cy = (target[3] + target[1]) / 2;
+        const T target_w = target[2] - target[0];
+        const T target_h = target[3] - target[1];
+        // Normalize by the prior box, then scale by the prior variance.
+        out[0] = (target_cx - prior_cx) / prior_w / var[0];
+        out[1] = (target_cy - prior_cy) / prior_h / var[1];
+        out[2] = std::log(std::fabs(target_w / prior_w)) / var[2];
+        out[3] = std::log(std::fabs(target_h / prior_h)) / var[3];
+      }
+    }
+  }
+  // Decodes each encoded target row against each prior box into corner form
+  // [xmin, ymin, xmax, ymax]. The four values for pair (i, j) are written
+  // starting at output[i * col * len + j * len].
+  void DecodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
+    const int64_t row = target_box.dims()[0];
+    const int64_t col = prior_box.dims()[0];
+    const int64_t len = prior_box.dims()[1];
+    const T* target_base = target_box.data<T>();
+    const T* prior_base = prior_box.data<T>();
+    const T* var_base = prior_box_var.data<T>();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* target = target_base + i * len;
+      for (int64_t j = 0; j < col; ++j) {
+        const T* prior = prior_base + j * len;
+        const T* var = var_base + j * len;
+        const size_t offset = i * col * len + j * len;
+        T* out = output + offset;
+        // Size and center of the prior box.
+        const T prior_w = prior[2] - prior[0];
+        const T prior_h = prior[3] - prior[1];
+        const T prior_cx = (prior[2] + prior[0]) / 2;
+        const T prior_cy = (prior[3] + prior[1]) / 2;
+        // Undo the encoding: centers via scale-and-shift, sizes via exp.
+        const T box_cx = var[0] * target[0] * prior_w + prior_cx;
+        const T box_cy = var[1] * target[1] * prior_h + prior_cy;
+        const T box_w = std::exp(var[2] * target[2]) * prior_w;
+        const T box_h = std::exp(var[3] * target[3]) * prior_h;
+        // Convert center/size to corner coordinates.
+        out[0] = box_cx - box_w / 2;
+        out[1] = box_cy - box_h / 2;
+        out[2] = box_cx + box_w / 2;
+        out[3] = box_cy + box_h / 2;
+      }
+    }
+  }
+  // Reads the operator inputs, allocates OutputBox and dispatches to the
+  // encoder or decoder according to the `code_type` attribute.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    // A TargetBox with LoD information must have exactly one LoD level.
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    T* output = output_box->data<T>();
+    switch (code_type) {
+      case BoxCodeType::kEncodeCenterSize:
+        EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+        break;
+      case BoxCodeType::kDecodeCenterSize:
+        DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+        break;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -54,7 +54,15 @@ class CompareOpKernel
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    using T = typename Functor::ELEM_TYPE;
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
+    using Tensor = framework::Tensor;
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    z->mutable_data<T>(context.GetPlace());
+    int axis = context.Attr<int>("axis");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
+                                                          z);
  }
 };

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
            ops::ConvOpGrad);
+// depthwise convolution op
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);
+// depthwise conv kernel
+// TODO(xingzhaolong): neon kernel for mobile
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);

--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -16,6 +16,16 @@ limitations under the License. */
 namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(  // depthwise_conv2d forward on CUDA: DepthwiseConvKernel for float and double (conv2d below uses GemmConvKernel instead)
+    depthwise_conv2d,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(  // depthwise_conv2d_grad on CUDA: DepthwiseConvGradKernel for float and double
+    depthwise_conv2d_grad,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);

--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/operators/elementwise_max_op.h
--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/operators/elementwise_min_op.h
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/elementwise_pow_op.cc
+++ b/paddle/operators/elementwise_pow_op.cc
--- a/paddle/operators/elementwise_pow_op.cu
+++ b/paddle/operators/elementwise_pow_op.cu
--- a/paddle/operators/elementwise_pow_op.h
+++ b/paddle/operators/elementwise_pow_op.h
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
--- a/paddle/operators/label_smooth_op.cc
+++ b/paddle/operators/label_smooth_op.cc
--- a/paddle/operators/label_smooth_op.cu
+++ b/paddle/operators/label_smooth_op.cu
--- a/paddle/operators/label_smooth_op.h
+++ b/paddle/operators/label_smooth_op.h
--- a/paddle/operators/layer_norm_op.cc
+++ b/paddle/operators/layer_norm_op.cc
--- a/paddle/operators/layer_norm_op.h
+++ b/paddle/operators/layer_norm_op.h
--- a/paddle/operators/listen_and_serv_op.cc
+++ b/paddle/operators/listen_and_serv_op.cc
--- a/paddle/operators/load_combine_op.cc
+++ b/paddle/operators/load_combine_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
--- a/paddle/operators/math/depthwise_conv.h
+++ b/paddle/operators/math/depthwise_conv.h
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/operators/math/sequence_padding.cu
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
--- a/paddle/operators/math/sequence_scale.cu
+++ b/paddle/operators/math/sequence_scale.cu
--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/operators/mine_hard_examples_op.cc
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
--- a/paddle/operators/save_combine_op.cc
+++ b/paddle/operators/save_combine_op.cc
--- a/paddle/operators/save_load_combine_op_test.cc
+++ b/paddle/operators/save_load_combine_op_test.cc
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/test.sh
+++ b/paddle/scripts/docker/test.sh
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
--- a/python/paddle/v2/fluid/debuger.py
+++ b/python/paddle/v2/fluid/debuger.py
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
--- a/python/paddle/v2/fluid/graphviz.py
+++ b/python/paddle/v2/fluid/graphviz.py
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
--- a/python/paddle/v2/fluid/tests/book/.gitignore
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
--- a/python/paddle/v2/fluid/tests/test_box_coder_op.py
+++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
--- a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
--- a/python/paddle/v2/fluid/tests/test_label_smooth_op.py
+++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
--- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
--- a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
+++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
--- a/python/paddle/v2/fluid/tests/test_recv_op.py
+++ b/python/paddle/v2/fluid/tests/test_recv_op.py
--- a/python/paddle/v2/fluid/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py