Merge branch 'develop' of http://github.com/paddlepaddle/paddle into fix_seq_expand

a51c8ccc · Yi Wang · b368f13e · 87b8c620 · a51c8ccc · a51c8ccc
1000 changed file
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -9,7 +9,7 @@ import subprocess
 import platform

 COPYRIGHT = '''
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

--- a/.gitignore
+++ b/.gitignore
+paddle/operators/check_t.save
+paddle/operators/check_tensor.ls
+paddle/operators/tensor.save
+python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
+python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
+python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
 build/
 build_doc/
@@ -27,5 +33,5 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
-paddle/pybind/pybind.h
+paddle/fluid/pybind/pybind.h
 python/paddle/version.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@@ -156,6 +156,7 @@ include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
+include(inference_lib)      # add paddle fluid inference libraries


 include_directories("${PADDLE_SOURCE_DIR}")

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
 # Contribute Code

+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the 
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
 We sincerely appreciate your contribution.  This document explains our workflow and work style.

 ## Workflow

--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+# The `python` apt package does not provide pip, so install python-pip as well;
+# every `pip` step below depends on it.
+RUN apt-get update && apt-get install -y python python-pip
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+# The released wheel is installed only to run the CIFAR-10 reader once at
+# build time, and is uninstalled again right after.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+# Install the locally built wheel (copied next to this Dockerfile) and drop it afterwards.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Accelerate Rate
+
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different Pserver Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fluid and v2 comes from network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so that kubernetes can find it.
+1. Run `kubectl create -f fluid_pserver.yaml && kubectl create -f fluid_trainer.yaml` (or the `v2_pserver.yaml` / `v2_trainer.yaml` pair for the v2 benchmark) to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbose Logs
+
+Edit the pserver and trainer manifests (for example `fluid_pserver.yaml` and `fluid_trainer.yaml`) and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
+import argparse
+import functools
+import os
+
+
+def str2bool(v):
+    """Convert a yes/no style command-line string into a bool.
+
+    Matching is case-insensitive. Raises argparse.ArgumentTypeError for any
+    string that is not one of the recognised spellings.
+    """
+    token = v.lower()
+    if token in ('yes', 'true', 't', 'y', '1'):
+        return True
+    if token in ('no', 'false', 'f', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+# Command-line interface of the benchmark; parsed once when the module loads.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', default=128, type=int, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    default=1e-3,
+    type=float,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', default=50, type=int, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    default='CPU',
+    type=str,
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument('--device_id', default=0, type=int, help="The device id.")
+parser.add_argument(
+    '--data_format',
+    default='NCHW',
+    type=str,
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    default='cifar10',
+    type=str,
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+# --local takes a yes/no style string that str2bool turns into a bool.
+parser.add_argument(
+    '--local',
+    default=True,
+    type=str2bool,
+    help='Whether to run as local mode.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+    """Build the VGG16 body: five conv groups followed by two fc layers.
+
+    Returns the output of the second 512-wide fc layer; the classifier layer
+    is added by the caller.
+    """
+
+    def conv_block(input, num_filter, groups, dropouts):
+        # One group: `groups` 3x3 conv layers with batch norm and the given
+        # per-layer dropout rates, closed by a 2x2 max pool with stride 2.
+        filters = [num_filter] * groups
+        return fluid.nets.img_conv_group(
+            input=input,
+            conv_num_filter=filters,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_size=2,
+            pool_stride=2,
+            pool_type='max')
+
+    # (num_filter, groups, dropout rates) for each of the five conv groups.
+    group_specs = [
+        (64, 2, [0.3, 0]),
+        (128, 2, [0.4, 0]),
+        (256, 3, [0.4, 0.4, 0]),
+        (512, 3, [0.4, 0.4, 0]),
+        (512, 3, [0.4, 0.4, 0]),
+    ]
+    features = input
+    for num_filter, groups, dropouts in group_specs:
+        features = conv_block(features, num_filter, groups, dropouts)
+
+    dropped = fluid.layers.dropout(x=features, dropout_prob=0.5)
+    hidden = fluid.layers.fc(input=dropped, size=512, act=None)
+    normed = fluid.layers.batch_norm(input=hidden, act='relu')
+    dropped = fluid.layers.dropout(x=normed, dropout_prob=0.5)
+    return fluid.layers.fc(input=dropped, size=512, act=None)
+
+
+def main():
+    """Build the VGG16 benchmark program and run it for the configured role.
+
+    With ``--local`` true the default main program is trained in-process.
+    Otherwise the program is transpiled for distributed training using the
+    PADDLE_INIT_PSERVERS and TRAINERS environment variables, and
+    TRAINING_ROLE (default "TRAINER") selects whether this process trains or
+    serves parameters. The PSERVER role additionally needs POD_IP.
+    A missing mandatory variable ends the process with exit status 1.
+    """
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+        args.device_id)
+    exe = fluid.Executor(place)
+
+    def require_env(name):
+        # Report a missing mandatory variable by name instead of failing later
+        # with a TypeError/AttributeError on None.
+        value = os.getenv(name)
+        if not value:
+            print("need env %s" % name)
+            sys.exit(1)
+        return value
+
+    def make_feed(data):
+        # Turn one mini-batch of (image, label) samples into the feed dict.
+        img_data = np.array(
+            [x[0].reshape(data_shape) for x in data]).astype("float32")
+        y_data = np.array([x[1] for x in data]).astype("int64")
+        return {"pixel": img_data, "label": y_data.reshape([-1, 1])}
+
+    def make_readers():
+        # Created on demand: only the local and trainer paths read data.
+        use_cifar = args.data_set == 'cifar10'
+        train = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10()
+                if use_cifar else paddle.dataset.flowers.train(),
+                buf_size=5120),
+            batch_size=args.batch_size)
+        evaluate = paddle.batch(
+            paddle.dataset.cifar.test10()
+            if use_cifar else paddle.dataset.flowers.test(),
+            batch_size=args.batch_size)
+        return train, evaluate
+
+    # test
+    def test(exe):
+        accuracy.reset(exe)
+        for data in test_reader():
+            exe.run(inference_program, feed=make_feed(data))
+
+        return accuracy.eval(exe)
+
+    def train_loop(exe, trainer_prog):
+        iters = 0
+        for pass_id in range(args.num_passes):
+            # train
+            start_time = time.time()
+            num_samples = 0
+            accuracy.reset(exe)
+            with profiler.profiler("CPU", 'total') as prof:
+                for data in train_reader():
+                    ts = time.time()
+                    loss, acc = exe.run(
+                        trainer_prog,
+                        feed=make_feed(data),
+                        fetch_list=[avg_cost] + accuracy.metrics)
+                    iters += 1
+                    num_samples += len(data)
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                        % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is the accumulation of batches, but not the current batch.
+
+            pass_elapsed = time.time() - start_time
+            pass_train_acc = accuracy.eval(exe)
+            pass_test_acc = test(exe)
+            print(
+                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                   pass_test_acc))
+
+    if args.local:
+        # Parameter initialization
+        exe.run(fluid.default_startup_program())
+
+        # data reader
+        train_reader, test_reader = make_readers()
+        train_loop(exe, fluid.default_main_program())
+    else:
+        pserver_ips = require_env("PADDLE_INIT_PSERVERS")  # all pserver IPs
+        pserver_endpoints = ",".join(
+            ':'.join([ip, "6174"]) for ip in pserver_ips.split(","))
+        print("pserver endpoints: ", pserver_endpoints)
+        trainers = int(require_env("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
+        training_role = os.getenv(
+            "TRAINING_ROLE",
+            "TRAINER")  # get the training role: trainer/pserver
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            optimize_ops,
+            params_grads,
+            pservers=pserver_endpoints,
+            trainers=trainers)
+
+        if training_role == "PSERVER":
+            # Only a pserver needs its own endpoint, so POD_IP is checked here
+            # rather than for every role.
+            current_endpoint = require_env("POD_IP") + ":6174"
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            print("starting server side startup")
+            exe.run(pserver_startup)
+            print("starting parameter server...")
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            # data reader
+            train_reader, test_reader = make_readers()
+
+            trainer_prog = t.get_trainer_program()
+            # Parameter initialization (run once).
+            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+            exe.run(fluid.default_startup_program())
+            train_loop(exe, trainer_prog)
+        else:
+            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
+
+
+def print_arguments():
+    """Print every parsed command-line argument as ``name: value``, sorted by name."""
+    print('-----------  Configuration Arguments -----------')
+    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    main()
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+# Batch size comes from the BATCH_SIZE environment variable; 128 when it is
+# unset or empty.
+BATCH_SIZE = int(os.getenv("BATCH_SIZE") or 128)
+print "batch_size", BATCH_SIZE
+# Total trainer count, read from the TRAINERS environment variable.
+NODE_COUNT = int(os.getenv("TRAINERS"))
+# Start time of the current iteration, updated by the training event handler.
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+    """Assemble a VGG classifier from five conv groups and three fc layers.
+
+    nums gives the number of conv layers in each of the five groups;
+    class_dim is the size of the final softmax layer.
+    """
+    assert len(nums) == 5
+
+    def conv_block(input, num_filter, groups, num_channels=None):
+        # `groups` 3x3 relu conv layers followed by a 2x2 max pool, stride 2.
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_size=2,
+            pool_stride=2,
+            pool_type=paddle.pooling.Max())
+
+    # the channel of input feature is 3
+    layer = conv_block(input, 64, nums[0], 3)
+    for num_filter, groups in zip([128, 256, 512, 512], nums[1:]):
+        layer = conv_block(layer, num_filter, groups)
+
+    fc_dim = 512
+    # Two identical relu fc layers, each with a drop rate of 0.5.
+    for _ in range(2):
+        layer = paddle.layer.fc(input=layer,
+                                size=fc_dim,
+                                act=paddle.activation.Relu(),
+                                layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    return paddle.layer.fc(input=layer,
+                           size=class_dim,
+                           act=paddle.activation.Softmax())
+
+
+def vgg13(input, class_dim):
+    """VGG variant with two conv layers in each of the five groups."""
+    return vgg(input, [2, 2, 2, 2, 2], class_dim)
+
+
+def vgg16(input, class_dim):
+    """VGG variant with 2, 2, 3, 3 and 3 conv layers in its five groups."""
+    return vgg(input, [2, 2, 3, 3, 3], class_dim)
+
+
+def vgg19(input, class_dim):
+    """VGG variant with 2, 2, 4, 4 and 4 conv layers in its five groups."""
+    return vgg(input, [2, 2, 4, 4, 4], class_dim)
+
+
+def main():
+    """Train VGG16 on CIFAR-10 with the v2 API in non-local (cluster) mode.
+
+    Prints the cost and timing of every batch, and runs the test reader at
+    the end of each pass.
+    """
+    paddle.init(use_gpu=False)
+
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
+    l2_penalty = paddle.optimizer.L2Regularization(rate=0.0005 * BATCH_SIZE)
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=l2_penalty,
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp")
+
+    # To use other data, replace cifar.train10() with:
+    # reader.train_reader('train.list')
+    shuffled_train = paddle.reader.shuffle(cifar.train10(), buf_size=1000)
+    train_reader = paddle.batch(shuffled_train, batch_size=BATCH_SIZE)
+    # To use other data, replace cifar.test10() with:
+    # reader.test_reader('val.list')
+    test_reader = paddle.batch(cifar.test10(), batch_size=BATCH_SIZE)
+
+    # Create trainer
+    extra_layers = None
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+
+    # Batches between two progress lines; 1 logs every batch.
+    log_every = 1
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % log_every == 0:
+                elapsed = time.time() - ts
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    elapsed)
+        if isinstance(event, paddle.event.EndPass):
+            pass_elapsed = time.time() - ts_pass
+            print "Pass %d end, spent: %f" % (event.pass_id, pass_elapsed)
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
 elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+    # nvcc 9 does not support -Os. Use Release flags instead
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -15,12 +15,13 @@
 include(ExternalProject)

 set(BOOST_PROJECT       "extern_boost")
-set(BOOST_VER           "1.66.0")
-set(BOOST_TAR           "boost_1_66_0")
-set(BOOST_URL           "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_VER           "1.41.0")
+set(BOOST_TAR           "boost_1_41_0")
+set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)

 include_directories(${BOOST_INCLUDE_DIR})


--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -28,9 +28,3 @@ endif()
 add_dependencies(eigen3 extern_eigen3)

 LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
-    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)

 LIST(APPEND external_project_dependencies gflags)

-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
  IF(ANDROID)
    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)

 LIST(APPEND external_project_dependencies glog)

-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
  INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
  IF(ANDROID)
    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
        CACHE FILEPATH "protoc library." FORCE)

-    IF(WITH_C_API OR WITH_FLUID)
+    IF(WITH_C_API)
        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
        IF(ANDROID)
            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
                    -DWITH_TORCH=OFF
                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                    -DBUILD_SHARED=ON
+                    -DBUILD_TESTS=OFF
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -179,15 +179,24 @@ function(cc_library TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (cc_library_SRCS)
-    if (cc_library_SHARED OR cc_library_shared) # build *.so
+  if(cc_library_SRCS)
+    if(cc_library_SHARED OR cc_library_shared) # build *.so
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
    else()
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
    endif()
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
+      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
    
    # cpplint code style
@@ -224,12 +233,18 @@ function(cc_test TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)

@@ -457,12 +472,12 @@ endfunction()

 function(py_test TARGET_NAME)
  if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
+# make package for paddle fluid shared and static library
+#
+# copy(<target> [DEPS <dep>...] SRCS <src>... DSTS <dst>...)
+#
+# Defines a custom target <target> that, when built, copies every SRCS entry
+# into the DSTS entry at the same position, so SRCS and DSTS must have the same
+# number of entries. DEPS is forwarded to add_custom_target(... DEPENDS ...).
+# SRCS entries are handed to `cp -r` as written, so they may name files,
+# directories or shell glob patterns.
+function(copy TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DSTS DEPS)
+    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # A misspelled keyword (for example DEPENDS instead of DEPS) lands in the
+    # unparsed arguments; report it instead of silently dropping it.
+    if(copy_lib_UNPARSED_ARGUMENTS)
+        message(WARNING "copy(${TARGET}): ignoring unrecognized arguments: ${copy_lib_UNPARSED_ARGUMENTS}")
+    endif()
+
+    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+    if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
+    endif()
+
+    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+
+    # Nothing to copy: leave before the loop, which would otherwise be given
+    # a RANGE stop value of -1.
+    if(copy_lib_SRCS_len EQUAL 0)
+        return()
+    endif()
+
+    math(EXPR len "${copy_lib_SRCS_len} - 1")
+    foreach(index RANGE ${len})
+        list(GET copy_lib_SRCS ${index} src)
+        list(GET copy_lib_DSTS ${index} dst)
+        # Always copy recursively. if(IS_DIRECTORY ...) is evaluated when CMake
+        # configures, so a source directory that is only created during the
+        # build would be misclassified as a file and plain `cp` would fail on it.
+        # `cp -r` behaves like `cp` for regular files and glob matches.
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
+            COMMAND mkdir -p "${dst}"
+            COMMAND cp -r "${src}" "${dst}")
+    endforeach()
+endfunction()
+
+# third party
+# Each copy() call below defines one staging target that places a dependency's
+# headers and libraries under ${CMAKE_INSTALL_PREFIX}/third_party. dst_dir is
+# re-assigned before every call and read only by the call that follows it.
+# NOTE(review): none of these calls passes DEPS, so nothing here orders the copy
+# after the step that produces the sources -- confirm that is handled elsewhere.
+
+# Eigen: the Eigen/Core header, Eigen/src and unsupported/Eigen.
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+# gflags: include directory into dst_dir, libraries into dst_dir/lib.
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+# glog: same layout as gflags.
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+# protobuf: the protobuf_lib target is only defined when PROTOBUF_FOUND is false.
+IF(NOT PROTOBUF_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    copy(protobuf_lib
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+# src_dir and dst_dir are the source root and staging root shared by the module
+# copy() calls that follow; `module` is re-assigned before each call.
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+# framework: headers from the source tree plus framework.pb.h from the build tree.
+# NOTE(review): DEPS framework_py_proto presumably ensures framework.pb.h has
+# been generated before it is copied -- confirm.
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto 
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+# memory: top-level headers and the detail/ headers.
+set(module "memory")
+copy(memory_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+# inference: headers plus libpaddle_fluid.so from the build tree.
+# The keyword must be DEPS: copy() only parses SRCS, DSTS and DEPS, so any other
+# spelling is left unparsed and the dependency on paddle_fluid_shared
+# (presumably the target that produces the .so) would be lost.
+set(module "inference")
+copy(inference_lib DEPS paddle_fluid_shared
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+# platform: top-level headers plus the dynload/ and details/ headers.
+set(module "platform")
+copy(platform_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+# string: top-level headers plus the tinyformat/ headers.
+set(module "string")
+copy(string_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+# Umbrella target: building inference_lib_dist builds every staging target
+# listed here. protobuf_lib is only defined inside the IF(NOT PROTOBUF_FOUND)
+# branch, so it is appended only when the target exists; a name in DEPENDS that
+# is not a target is looked up as a file and breaks the build.
+set(inference_lib_dist_deps
+  inference_lib framework_lib memory_lib platform_lib string_lib
+  gflags_lib glog_lib eigen3_lib)
+if(TARGET protobuf_lib)
+  list(APPEND inference_lib_dist_deps protobuf_lib)
+endif()
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_deps})
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                  ${SPHINX_CACHE_DIR_CN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
--- a/doc/api/CMakeLists.txt
+++ b/doc/api/CMakeLists.txt
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+# Generate conf.py in the build tree from the English template one directory
+# up; @ONLY limits substitution to @VAR@ references.
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+# Register the paddle_api_docs target. sphinx_add_target is defined outside this
+# file; presumably its arguments are builder, conf dir, cache dir, source dir
+# and output dir, in that order -- confirm against its definition.
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
    :noindex:

+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
 Norm Layer
 ==========

@@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
    :noindex:

+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+    
 Recurrent Layer Group
 =====================

@@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:

+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+    
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:

+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Sampling Layers
 ===============

@@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:

-Factorization Machine Layer
-============================
-
-factorization_machine
---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
-
 ..  _api_v2.layer_costs:

 Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
    :noindex:

+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
+    
 Check Layer
 ============

@@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:

-Miscs
-=====
-
-dropout
--------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========

 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
-
-gated_unit
-----------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-
-Detection output Layer
-======================
-
-detection_output
----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-DataFeeder
+data_feeder
 ===========

 DataFeeder
-----------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+----------
+
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
    :noindex:
+
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
-===========
-Evaluator
-===========
-
-Evaluator
-----------
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
    :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
+
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
-===========
-Executor
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========

 Executor
+--------
+
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+
+global_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+
+switch_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
    :noindex:
+
--- a/doc/api/v2/fluid/gen_doc.py
+++ b/doc/api/v2/fluid/gen_doc.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    def __init__(self, module_name, stream=sys.stdout):
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        else:
+            self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        submodule = getattr(self.module, submodule_name)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        item = getattr(self.module, name)
+        if isinstance(item, types.TypeType):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+
+    def print_class(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+
+def main():
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
--- a/doc/api/v2/fluid/gen_doc.sh
+++ b/doc/api/v2/fluid/gen_doc.sh
+#!/bin/bash
+# Regenerate the fluid API .rst files with gen_doc.py. Paths are relative, so
+# run this from the directory that contains gen_doc.py.
+
+# layers is split into one section per submodule.
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+
+# Every other module gets a single flat page; each module is listed once.
+for module in data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+  python gen_doc.py ${module} > ${module}.rst
+done
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Initializer
+initializer
 ===========

+Constant
+--------

-
-Initializer
-----------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: Initializer
-    :noindex:
-
-
-
-ConstantInitializer
-------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: ConstantInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Constant
+    :members:
    :noindex:

+Uniform
+-------

-
-UniformInitializer
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: UniformInitializer
-    :noindex:
-
-
-
-NormalInitializer
-----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: NormalInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Uniform
+    :members:
    :noindex:

+Normal
+------

-XavierInitializer
-----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: XavierInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Normal
+    :members:
    :noindex:

+Xavier
+------

-MSRAInitializer
---------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: MSRAInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Xavier
+    :members:
    :noindex:

--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
-===========
-IO
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!

+==
+io
+==

+save_vars
+---------

-is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_vars
+    :noindex:
+
+save_params
 -----------
-..  autofunction:: paddle.v2.fluid.io.is_parameter
+
+..  autofunction:: paddle.v2.fluid.io.save_params
+    :noindex:
+
+save_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.save_persistables
+    :noindex:
+
+load_vars
+---------
+
+..  autofunction:: paddle.v2.fluid.io.load_vars
+    :noindex:
+
+load_params
+-----------
+
+..  autofunction:: paddle.v2.fluid.io.load_params
    :noindex:
+
+load_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.load_persistables
+    :noindex:
+
+save_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.save_inference_model
+    :noindex:
+
+load_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.load_inference_model
+    :noindex:
+
+get_inference_program
+---------------------
+
+..  autofunction:: paddle.v2.fluid.io.get_inference_program
+    :noindex:
+
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
-==========
-Layers
-==========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!

+======
+layers
+======

-fc
---
-..  autofunction:: paddle.v2.fluid.layers.fc
+control_flow
+============
+
+split_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:

-embedding
---------
-..  autofunction:: paddle.v2.fluid.layers.embedding
+merge_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
    :noindex:

-dynamic_lstm
------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+BlockGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+    :members:
    :noindex:

-dynamic_gru
-----------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+BlockGuardWithCompletion
+------------------------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+    :members:
    :noindex:

-data
----
-..  autofunction:: paddle.v2.fluid.layers.data
+StaticRNNMemoryLink
+-------------------
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+    :members:
    :noindex:

-mean
----
-..  autofunction:: paddle.v2.fluid.layers.mean
+WhileGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+    :members:
    :noindex:

-mul
---
-..  autofunction:: paddle.v2.fluid.layers.mul
+While
+-----
+
+..  autoclass:: paddle.v2.fluid.layers.While
+    :members:
    :noindex:

-elementwise_add
---------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+lod_rank_table
+--------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:

-elementwise_sub
---------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+max_sequence_len
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:

-elementwise_mul
---------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+topk
+----
+
+..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:

-elementwise_div
---------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+lod_tensor_to_array
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:

+array_to_lod_tensor
+-------------------

-dropout
-------
-..  autofunction:: paddle.v2.fluid.layers.dropout
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:

+increment
+---------

-reshape
--------
-..  autofunction:: paddle.v2.fluid.layers.reshape
+..  autofunction:: paddle.v2.fluid.layers.increment
    :noindex:

+array_write
+-----------

-sigmoid
---------
-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+..  autofunction:: paddle.v2.fluid.layers.array_write
    :noindex:

+create_array
+------------

-scale
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+less_than
 ---------
-..  autofunction:: paddle.v2.fluid.layers.scale
+
+..  autofunction:: paddle.v2.fluid.layers.less_than
    :noindex:

+array_read
+----------

-transpose
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+shrink_memory
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+
+array_length
+------------
+
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+IfElse
+------
+
+..  autoclass:: paddle.v2.fluid.layers.IfElse
+    :members:
+    :noindex:
+
+DynamicRNN
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+
+ConditionalBlock
+----------------
+
+..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+
+StaticRNN
 ---------
-..  autofunction:: paddle.v2.fluid.layers.transpose
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+    :members:
    :noindex:

+reorder_lod_tensor_by_rank
+--------------------------

-sigmoid_cross_entropy_with_logits
---------------------------------
-..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
    :noindex:

+ParallelDo
+----------

-cast
+..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+
+Print
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.Print
+    :noindex:
+
+device
+======
+
+get_places
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.get_places
+    :noindex:
+
+io
+==
+
+data
 ----
-..  autofunction:: paddle.v2.fluid.layers.cast
+
+..  autofunction:: paddle.v2.fluid.layers.data
    :noindex:

+BlockGuardServ
+--------------

-concat
-------
-..  autofunction:: paddle.v2.fluid.layers.concat
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+    :members:
    :noindex:

+ListenAndServ
+-------------

-sums
+..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+
+Send
 ----
-..  autofunction:: paddle.v2.fluid.layers.sums
+
+..  autofunction:: paddle.v2.fluid.layers.Send
    :noindex:

+nn
+==

-linear_chain_crf
----------------
-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+fc
+--
+
+..  autofunction:: paddle.v2.fluid.layers.fc
    :noindex:

+embedding
+---------

-assign
-------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
    :noindex:

+dynamic_lstm
+------------

-split_lod_tensor
----------------
-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
    :noindex:

+dynamic_lstmp
+-------------

-merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+
+dynamic_gru
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+
+gru_unit
+--------
+
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+
+linear_chain_crf
 ----------------
-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+crf_decoding
+------------
+
+..  autofunction:: paddle.v2.fluid.layers.crf_decoding
    :noindex:

 cos_sim
--------
+-------
+
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
    :noindex:

-
 cross_entropy
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
    :noindex:

-
-
 square_error_cost
 -----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
    :noindex:

-
 accuracy
---------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.accuracy
    :noindex:

+chunk_eval
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+    :noindex:

 sequence_conv
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
    :noindex:

-
 conv2d
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
    :noindex:

-
 sequence_pool
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
    :noindex:

+pool2d
+------

-sequence_first_step
-------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+..  autofunction:: paddle.v2.fluid.layers.pool2d
    :noindex:

+batch_norm
+----------

-sequence_last_step
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+
+layer_norm
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.layer_norm
+    :noindex:
+
+beam_search_decode
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
    :noindex:

+conv2d_transpose
+----------------

-pool2d
------
-..  autofunction:: paddle.v2.fluid.layers.pool2d
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
    :noindex:

+sequence_expand
+---------------

-batch_norm
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+lstm_unit
+---------
+
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+reduce_sum
 ----------
-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+reduce_mean
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
    :noindex:

+reduce_max
+----------

-beam_search_decode
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
+
+reduce_min
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+
+sequence_first_step
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+sequence_last_step
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+dropout
+-------
+
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+
+split
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.split
    :noindex:

+ctc_greedy_decoder
+------------------

-lod_rank_table
--------------
-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
    :noindex:

+edit_distance
+-------------

-max_sequence_len
----------------
-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
    :noindex:

+l2_normalize
+------------

-topk
-----
-..  autofunction:: paddle.v2.fluid.layers.topk
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
    :noindex:

+matmul
+------

-lod_tensor_to_array
-------------------
-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+..  autofunction:: paddle.v2.fluid.layers.matmul
    :noindex:

+warpctc
+-------

-
-array_to_lod_tensor
-------------------
-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.warpctc
    :noindex:

+sequence_reshape
+----------------

+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+    :noindex:

+transpose
+---------

-fill_constant
-------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant
+..  autofunction:: paddle.v2.fluid.layers.transpose
    :noindex:

+im2sequence
+-----------

-
-fill_constant_batch_size_like
-----------------------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
    :noindex:

+nce
+---

-ones
----
-..  autofunction:: paddle.v2.fluid.layers.ones
+..  autofunction:: paddle.v2.fluid.layers.nce
    :noindex:

+beam_search
+-----------

-zeros
-----
-..  autofunction:: paddle.v2.fluid.layers.zeros
+..  autofunction:: paddle.v2.fluid.layers.beam_search
    :noindex:

+row_conv
+--------

-increment
---------
-..  autofunction:: paddle.v2.fluid.layers.increment
+..  autofunction:: paddle.v2.fluid.layers.row_conv
    :noindex:

+multiplex
+---------

-array_write
-----------
-..  autofunction:: paddle.v2.fluid.layers.array_write
+..  autofunction:: paddle.v2.fluid.layers.multiplex
    :noindex:

+ops
+===

+mean
+----

-create_array
------------
-..  autofunction:: paddle.v2.fluid.layers.create_array
+..  autofunction:: paddle.v2.fluid.layers.mean
    :noindex:

+mul
+---

-less_than
---------
-..  autofunction:: paddle.v2.fluid.layers.less_than
+..  autofunction:: paddle.v2.fluid.layers.mul
    :noindex:

+reshape
+-------

-array_read
----------
-..  autofunction:: paddle.v2.fluid.layers.array_read
+..  autofunction:: paddle.v2.fluid.layers.reshape
    :noindex:

+scale
+-----

-shrink_memory
--------------
-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+..  autofunction:: paddle.v2.fluid.layers.scale
    :noindex:

+sigmoid_cross_entropy_with_logits
+---------------------------------

-array_length
-------------
-..  autofunction:: paddle.v2.fluid.layers.array_length
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
    :noindex:

+elementwise_add
+---------------

-conv2d_transpose
----------------
-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
    :noindex:

-
-sequence_expand
+elementwise_div
 ---------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
    :noindex:

+elementwise_sub
+---------------

-gru_unit
--------
-..  autofunction:: paddle.v2.fluid.layers.gru_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
    :noindex:

+elementwise_mul
+---------------

-lstm_unit
---------
-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
    :noindex:

+elementwise_max
+---------------

-sequence_softmax
----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+..  autofunction:: paddle.v2.fluid.layers.elementwise_max
    :noindex:

+elementwise_min
+---------------

-reduce_sum
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+..  autofunction:: paddle.v2.fluid.layers.elementwise_min
    :noindex:

+elementwise_pow
+---------------

-reduce_mean
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
    :noindex:

+clip
+----

-reduce_max
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_max
+..  autofunction:: paddle.v2.fluid.layers.clip
    :noindex:

+clip_by_norm
+------------

-reduce_min
----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_min
+..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
    :noindex:

+sequence_softmax
+----------------

-split
-----
-..  autofunction:: paddle.v2.fluid.layers.split
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
    :noindex:

+sigmoid
+-------

-matmul
------
-..  autofunction:: paddle.v2.fluid.layers.matmul
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
    :noindex:

 logsigmoid
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
    :noindex:

 exp
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.exp
    :noindex:

 relu
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu
    :noindex:

 tanh
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh
    :noindex:

 tanh_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
    :noindex:

 softshrink
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.softshrink
    :noindex:

 sqrt
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.sqrt
    :noindex:

 abs
----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.abs
    :noindex:

 ceil
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.ceil
    :noindex:

 floor
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.floor
    :noindex:

 round
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.round
    :noindex:

 reciprocal
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.reciprocal
    :noindex:

 log
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.log
    :noindex:

 square
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.square
    :noindex:

 softplus
 --------
+
 ..  autofunction:: paddle.v2.fluid.layers.softplus
    :noindex:

 softsign
---------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.softsign
    :noindex:

 brelu
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.brelu
    :noindex:

 leaky_relu
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.leaky_relu
    :noindex:

 soft_relu
 ---------
+
 ..  autofunction:: paddle.v2.fluid.layers.soft_relu
    :noindex:

 elu
----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.elu
    :noindex:

 relu6
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu6
    :noindex:

 pow
----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.pow
    :noindex:

+stanh
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.stanh
+    :noindex:
+
 hard_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_shrink
    :noindex:

 thresholded_relu
 ----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
    :noindex:

 hard_sigmoid
-------------
+------------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
    :noindex:

 swish
------
+-----
+
 ..  autofunction:: paddle.v2.fluid.layers.swish
    :noindex:

-im2sequence
+tensor
+======
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_tensor
+    :noindex:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_parameter
+    :noindex:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_global_var
+    :noindex:
+
+cast
+----
+
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+concat
 ------
-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+
+..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:

-edit_distance
---------------
-..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
+sums
+----
+
+..  autofunction:: paddle.v2.fluid.layers.sums
    :noindex:

-ctc_greedy_decoder
---------------
-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+assign
+------
+
+..  autofunction:: paddle.v2.fluid.layers.assign
    :noindex:

-l2_normalize
------------
-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
    :noindex:

-sequence_reshape
----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+fill_constant
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:

-row_conv
--------
-..  autofunction:: paddle.v2.fluid.layers.row_conv
+ones
+----
+
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+
+zeros
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:
+
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
-===========
-Nets
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+nets
+====

 simple_img_conv_pool
 --------------------
-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-    :noindex:

-
-img_conv_group
---------------
-..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
    :noindex:

-
 sequence_conv_pool
 ------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:

-
 glu
 ---
+
 ..  autofunction:: paddle.v2.fluid.nets.glu
    :noindex:

-
 scaled_dot_product_attention
 ----------------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
    :noindex:

--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
-===========
-Optimizer
-===========
-
-Optimizer
-----------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: Optimizer
-    :noindex:
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!

+=========
+optimizer
+=========

-SGDOptimizer
-----------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: SGDOptimizer
-    :noindex:
+SGD
+---

+..  autoclass:: paddle.v2.fluid.optimizer.SGD
+    :members:
+    :noindex:

+Momentum
+--------

-MomentumOptimizer
-----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: MomentumOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Momentum
+    :members:
    :noindex:

+Adagrad
+-------

-
-AdagradOptimizer
----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
+    :members:
    :noindex:

+Adam
+----

-AdamOptimizer
-------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adam
+    :members:
    :noindex:

+Adamax
+------

-AdamaxOptimizer
-----------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamaxOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adamax
+    :members:
    :noindex:

+DecayedAdagrad
+--------------

-DecayedAdagradOptimizer
-----------------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: DecayedAdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+    :members:
    :noindex:

--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+param_attr
+==========
+
 ParamAttr
-===========
+---------

+..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:

+WeightNormParamAttr
+-------------------

-ParamAttr
-----------
-..  automodule:: paddle.v2.fluid.param_attr
-    :members: ParamAttr
+..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+    :members:
    :noindex:
+
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
-===========
-Profiler
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!

+========
+profiler
+========

+cuda_profiler
+-------------

-Profiler
-----------
 ..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
    :noindex:
+
+reset_profiler
+--------------
+
+..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+    :noindex:
+
+profiler
+--------
+
+..  autofunction:: paddle.v2.fluid.profiler.profiler
+    :noindex:
+
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Regularizer
+regularizer
 ===========

-WeightDecayRegularizer
----------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: WeightDecayRegularizer
-    :noindex:
-
+append_regularization_ops
+-------------------------

-L2DecayRegularizer
------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L2DecayRegularizer
+..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
    :noindex:

+L1Decay
+-------

+..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+    :members:
+    :noindex:

-L1DecayRegularizer
-------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L1DecayRegularizer
+L2Decay
+-------

+..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+    :members:
+    :noindex:

--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
+从源码编译
+======================
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境，执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle，有两种方法：
+
+.. code-block:: bash
+
+   # 1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+
+如果不使用Docker，可以执行ctest命令即可：
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+
+.. _compile_deps:
+
+编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则使用OpenBLAS", "ON"
+
+BLAS
+++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
+++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` **）后，再指定。**
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
+tools. We recommend you to use our pre-built Docker image to run the build
+to avoid installing dependencies by yourself. We have several build environment
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+
+If you choose not to use a Docker image for your build, you need to install the
+`Compile Dependencies`_ listed below before running the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build CPU-only binaries if you are using Docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # else run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+   # 1. uninstall and reinstall
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. upgrade directly
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+
+If you don't use Docker, running ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+--------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ .
+
+.. _build_options_bool:
+
+Bool Type Options
+-----------------
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
+++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as the BLAS library. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
+++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
+you built.
+
+Pass Compile Options
+++++++++++++++++++++
+
+You can pass compile options to use intended BLAS/CUDA/cuDNN libraries.
+When running cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, i.e.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
--- a/doc/build_and_install/docker_install_cn.rst
+++ b/doc/build_and_install/docker_install_cn.rst
+使用Docker安装运行
+================================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
+
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
+
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+------------------------------
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像：
+
+  .. code-block:: bash
+
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
+
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
+
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+-----------------------------------
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址：
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+就这么简单，享受您的旅程！
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+------------------------------
+
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX：**
+
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX：
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No，就需要选择使用no-AVX的镜像
--- a/doc/build_and_install/docker_install_en.rst
+++ b/doc/build_and_install/docker_install_en.rst
+Run in Docker Containers
+=================================
+
+Running PaddlePaddle in a Docker container means you don't need to care about
+runtime dependencies, and it also lets you run under Windows. You can find
+Docker tutorials `here <https://docs.docker.com/get-started/>`_ .
+
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
+
+After you've read the above tutorials you may proceed with the following steps.
+
+.. _docker_pull:
+
+Pull PaddlePaddle Docker Image
+------------------------------
+
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+Choose between different BLAS version:
+
+  .. code-block:: bash
+
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
+
+
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # i.e.
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+Launch your training program in Docker
+--------------------------------------
+
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally
+:code:`/work/train.py` is the command to run inside docker.
+
+Also, you can go into the container shell, run or debug your code
+interactively:
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
+
+.. _docker_run_book:
+
+PaddlePaddle Book
+------------------
+
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+We provide a packaged book image, simply issue the command:
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+Then open the following address in your local browser:
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+That's all. Enjoy your journey!
+
+.. _docker_run_gpu:
+
+Train with Docker with GPU
+------------------------------
+
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
--- a/doc/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
+安装与编译
+==========
+
+.. _install_steps:
+
+安装流程
+++++++++
+
+PaddlePaddle提供pip和Docker的安装方式：
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_cn.rst
+   docker_install_cn.rst
+   build_cn.md
+
+编译流程
+++++++++
+
+..  warning::
+
+    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
+
+..  toctree::
+    :maxdepth: 1
+
+    build_from_source_cn.rst
+
+常见问题解答
+++++++++++++
+
+`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
--- a/doc/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
+Install and Build
+=================
+
+.. _install_steps:
+
+Install Steps
++++++++++++++
+
+You can choose either pip or Docker to complete your install:
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_en.rst
+   docker_install_en.rst
+   build_en.md
+
+
+Build from Source
++++++++++++++++++
+
+..  warning::
+
+    We recommend installing directly via the installation steps above; you'll only need to build PaddlePaddle from source when you need a modified binary.
+
+..  toctree::
+    :maxdepth: 1
+
+    build_from_source_en.rst
+
+FAQ
++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/en/faq/build_and_install/index_en.html>`_
--- a/doc/getstarted/build_and_install/paddleci.png
+++ b/doc/getstarted/build_and_install/paddleci.png
--- a/doc/build_and_install/pip_install_cn.rst
+++ b/doc/build_and_install/pip_install_cn.rst
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
--- a/doc/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
+Install Using pip
+================================
+
+You can use the widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method works on
+most current Linux systems and on MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine. It will also download the required dependencies. The installed version is cpu_avx_openblas.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+If you wish to install the latest develop branch PaddlePaddle, 
+you can download the latest whl package from our CI system. Access
+the below links, log in as guest, then click at the "Artifact"
+tab, you'll find the download link of whl packages.
+
+If the links below show a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) contain not only .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages are trying to satisfy
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as default build environment. But CUDA libraries
+seem to run only on CentOS 6 or later; also, CentOS 5 is about to reach its end of life,
+so we use CentOS 6 as default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, pypi only supports manylinux1 standard, you'll need to
+  upgrade your pip to >9.0.0. Then run the below command:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  Then you'll get supported package suffixes, then check if it matches
+  the file name of the whl package. You can find default whl package at
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
-## Auto Gradient Checker Design
+## Auto Gradient Check Design

-## Backgraound：
- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  1. you should get the right backpropagation formula according to the forward computation.
-  2. you should implement it right in CPP.
-  3. it's difficult to prepare test data.
+## Background：
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The backpropagation formula should be correct according to the forward computation.
+  2. The implementation of the above should be correct in CPP.
+  3. It is difficult to prepare unbiased test data.

- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
-  1. numerical gradient checker only need forward operator.
-  2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. Numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator.

 ## Mathematical Theory
-The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.

 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)


-## Numeric Gradient Implementation
+## Numerical Gradient Implementation
 ### Python Interface
 ```python
 def get_numerical_gradient(op,
@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
                         delta=0.005,
                         local_scope=None):
    """
-    Get Numeric Gradient for an operator's input.
+    Get Numerical Gradient for the input of an operator.

-    :param op: C++ operator instance, could be an network
+    :param op: C++ operator instance, could be a network.
    :param input_values: The input variables. Should be an dictionary, whose key is
-    variable name, and value is numpy array.
+    variable name, and value is a numpy array.
    :param output_name: The final output variable name.
-    :param input_to_check: The input variable with respect to which to compute the gradient.
-    :param delta: The perturbation value for numeric gradient method. The
-    smaller delta is, the more accurate result will get. But if that delta is
-     too small, it will suffer from numerical stability problem.
+    :param input_to_check: The input variable with respect to which the gradient has to be computed.
+    :param delta: The perturbation value for numerical gradient method. The
+    smaller the delta, the more accurate the result. But if the delta is too
+    small, it will suffer from the numerical stability problem.
    :param local_scope: The local scope used for get_numeric_gradient.
    :return: The gradient array in numpy format.
    """
 ```

-### Explaination:
+### Explanation:

- Why need `output_name`
-  - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
+- Why do we need an `output_name`
+  - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.

- Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+- Why do we need `input_to_check`
+  - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numerical_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numerical_gradient` multiple times, each with a different input.


 ### Core Algorithm Implementation


 ```python
-    # we only compute gradient of one element a time.
+    # we only compute the gradient of one element at a time.
    # we use a for loop to compute the gradient of each element.
    for i in xrange(tensor_size):
-        # get one input element by its index i.
-        origin = tensor_to_check.get_float_element(i)
+        # get one input element using the index i.
+        original = tensor_to_check.get_float_element(i)

-        # add delta to it, run op and then get the new value of the result tensor.
-        x_pos = origin + delta
+        # add delta to it, run the forward op and then
+        # get the new value of the result tensor.
+        x_pos = original + delta
        tensor_to_check.set_float_element(i, x_pos)
        y_pos = get_output()

-        # plus delta to this element, run op and get the new value of the result tensor.
-        x_neg = origin - delta
+        # Subtract delta from this element, run the op again
+        # and get the new value of the result tensor.
+        x_neg = original - delta
        tensor_to_check.set_float_element(i, x_neg)
        y_neg = get_output()

        # restore old value
-        tensor_to_check.set_float_element(i, origin)
+        tensor_to_check.set_float_element(i, original)

-        # compute the gradient of this element and store it into a numpy array.
+        # compute the gradient of this element and store
+        # it into a numpy array.
        gradient_flat[i] = (y_pos - y_neg) / delta / 2

    # reshape the gradient result to the shape of the source tensor.
    return gradient_flat.reshape(tensor_to_check.get_dims())
 ```

-## Auto Graident Checker Framework
+## Auto Gradient Check Framework

 Each Operator Kernel has three kinds of Gradient:

 1. Numerical gradient
 2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
+3. GPU kernel gradient (if supported by the device)

-The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:

-1. calculate the numerical gradient
-2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported).

 #### Python Interface

@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
        """
        :param forward_op: used to create backward_op
        :param input_vars: numpy value of input variable. The following
-            computation will use these variables.
-        :param inputs_to_check: the input variable with respect to which to compute the gradient.
+          computation will use these variables.
+        :param inputs_to_check: the input variable with respect to which the
+          gradient will be computed.
        :param output_name: The final output variable name.
        :param max_relative_error: The relative tolerance parameter.
-        :param no_grad_set: used when create backward ops
+        :param no_grad_set: used to create backward ops
        :param only_cpu: only compute and check gradient on cpu kernel.
        :return:
        """
 ```

-### How to check if two numpy array is close enough?
-if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
+### How to check if two numpy arrays are close enough?
+if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad.

 ```python
 numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())

 abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
 abs_numerical_grad[abs_numerical_grad < 1e-3] = 1

 diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)


 #### Notes：
-The Input data for auto gradient checker should be reasonable to avoid numerical  stability problem.
+The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.


-#### Refs:
+#### References:

 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
--- a/doc/design/cpp_data_feeding.md
+++ b/doc/design/cpp_data_feeding.md
+# C++ Data Feeding
+
+In training with Paddle V2 API, data feeding depends entirely on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+
+In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
+
+## Reader
+
+A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data.
+
+
+### `ReaderBase`
+
+`ReaderBase` is the abstract base class of all readers. It defines the interfaces shared by all readers.
+
+```cpp
+class ReaderBase {
+ public:
+  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
+    PADDLE_ENFORCE(!shapes_.empty());
+  }
+  // Read the next batch of data. (A 'batch' can be only one instance)
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  // Show whether the next batch exists.
+  virtual bool HasNext() const = 0;
+  
+  // Reinitialize the reader and read the file from the beginning.
+  virtual void ReInit() = 0;
+  
+  // Get a certain read in data's shape.
+  DDim shape(size_t idx) const;
+  // Get shapes of all read in data.
+  std::vector<DDim> shapes() const { return shapes_; }
+  // Set shapes of read in data.
+  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
+
+  virtual ~ReaderBase() {}
+
+ protected:
+  std::vector<DDim> shapes_;
+};
+```
+
+### `FileReader` and `DecoratedReader`
+
+These two classes are derived from `ReaderBase` and are in turn the base classes of the specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format and yields only one instance of data at a time, e.g. a RecordIO reader or a jpg reader. A decorated reader takes another reader (either a file reader or a decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them (shuffling or batching), and then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+All the readers share exactly the same interfaces defined in `ReaderBase`, so they can be decorated more than once: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing exactly what they are.
+
+
+### `ReaderHolder`
+
+Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+we have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires that we know the reader's exact type every time we get a reader from a variable, which is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some new ops are introduced:
+
+### `CreateReaderOp`
+
+Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
+
+### `ReadOp`
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i

 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.

-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.

 ### Type Channel

@@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:

 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```

 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:

 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```

 or Tensors of Tensors of float16 etc.
@@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we

 ### Send and Recv

+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok := <-ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
 ### Select

+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1 = fluid.make_channel(dtype=INT)
+ch2 = fluid.make_channel(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax similar to the write mode in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax similar to the read mode in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs

 ### 1. RPC between Trainers and Parameter Servers

--- a/doc/design/dist_refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -152,12 +152,12 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.

-<img src="src/remote_executor.png"/>
+<img src="src/remote_executor.png" width="500" align="center" />

 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
 to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
+to start the final Kubernetes Jobs to run the different roles of `ProgramDesc` from `ConfigMap`.


 ### Placement Algorithm

--- a/doc/design/dist_refactor/src/remote_executor.graffle
+++ b/doc/design/dist_refactor/src/remote_executor.graffle
--- a/doc/design/dist_refactor/src/remote_executor.png
+++ b/doc/design/dist_refactor/src/remote_executor.png
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/README.MD
-# DeepSpeech2 on PaddlePaddle: Design Doc 
-
-We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
-
- Release a basic distributed implementation of DS2 on PaddlePaddle.
- Contribute a chapter of Deep Speech to PaddlePaddle Book.
-
-Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
-
-## Table of Contents
-
- [Tasks](#tasks)
- [Task Dependency](#task-dependency)
- [Design Details](#design-details)
-    - [Overview](#overview)
-    - [Row Convolution](#row-convolution)
-    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
- [Future Work](#future-work)
- [References](#references)
-
-## Tasks
-
-We roughly break down the project into 14 tasks:
-
-1. Develop an **audio data provider**:
-	- Json filelist generator.
-	- Audio file format transformer.
-	- Spectrogram feature extraction, power normalization etc.
-	- Batch data reader with SortaGrad.
-	- Data augmentation (optional).
-	- Prepare (one or more) public English data sets & baseline.
-2. Create a **simplified DS2 model configuration**:
-   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
-	- With only bidirectional-GRU (otherwise need *Task 4*).
-	- With only greedy decoder (otherwise need *Task 5, 6*).
-3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
-   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
-4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
-   - Lookahead convolution windows.
-   - Within-row convolution, without kernels shared across rows.
-5. Build KenLM **language model** (5-gram) for beam search decoder:
-   - Use KenLM toolkit.
-   - Prepare the corpus & train the model.
-   - Create infererence interfaces (for Task 6).
-6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
-   - Beam search with CTC.
-   - Beam search with external custom scorer (e.g. LM).
-   - Try to design a more general beam search interface.
-7. Develop a **Word Error Rate evaluator**:
-   - update `ctc_error_evaluator`(CER) to support WER.
-8. Prepare internal dataset for Mandarin (optional):
-    - Dataset, baseline, evaluation details.
-    - Particular data preprocessing for Mandarin.
-    - Might need cooperating with the Speech Department.
-9. Create **standard DS2 model configuration**:
-   - With variable-length audio sequences (need *Task 3*).
-	- With unidirectional-GRU + row-convolution (need *Task 4*).
-	- With CTC-LM beam search decoder (need *Task 5, 6*).
-10. Make it run perfectly on **clusters**.
-11. Experiments and **benchmarking** (for accuracy, not efficiency):
-    - With public English dataset.
-    - With internal (Baidu) Mandarin dataset (optional).
-12. Time **profiling** and optimization.
-13. Prepare **docs**.
-14. Prepare PaddlePaddle **Book** chapter with a simplified version.
-
-## Task Dependency
-
-Tasks parallelizable within phases:
-
-Roadmap     | Description                               | Parallelizable Tasks 
----------- | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
-
-Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
-
-## Design Details
-
-### Overview
-
-Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
-
-Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
-
-The classical DS2 network contains 15 layers (from bottom to top):
-
- **Two** data layers (audio spectrogram, transcription text)
- **Three** 2D convolution layers
- **Seven** uni-directional simple-RNN layers
- **One** lookahead row convolution layers
- **One** fully-connected layers
- **One** CTC-loss layer
-
-<div align="center">
-<img src="image/ds2_network.png" width=350><br/>
-Figure 1. Archetecture of Deep Speech 2 Network.
-</div>
-
-We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
-
-Key ingredients about the layers:
-
- **Data Layers**: 
-   - Frame sequences data of audio **spectrogram** (with FFT).
-   - Token sequences data of **transcription** text (labels). 
-   - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
- **2D Convolution Layers**: 
-   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
-   - With striding for only the first convlution layer.
-   - No pooling for all convolution layers.
- **Uni-directional RNNs** 
-	- Uni-directional + row convolution: for low-latency inference.
-	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
- **Row convolution**:
-	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-	- Not nessesary if with bi-direcitional RNNs. 
-	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
- **Batch Normalization Layers**:
-   - Added to all above layers (except for data and loss layer).
-   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
- 
-
-Required Components                     | PaddlePaddle Support                      | Need to Develop
-:-------------------------------------  | :--------------------------------------   | :-----------------------
-Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
-Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
-DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
-Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
-Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
-CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
-Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
-CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
-
-### Row Convolution
-
-TODO by Assignees
-
-### Beam Search with CTC and LM
-
-TODO by Assignees
-
-## Future Work
-
- Efficiency Improvement
- Accuracy Improvement
- Low-latency Inference Library
- Large-scale benchmarking
-
-## References
-
-1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
-2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
--- a/doc/design/speech/deep_speech_2.md
+++ b/doc/design/speech/deep_speech_2.md
+# DeepSpeech2 on PaddlePaddle: Design Doc 
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create inference interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+Roadmap     | Description                               | Parallelizable Tasks 
+----------- | :------------------------------------     | :--------------------
+Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
+Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
+Phase III   | Documentations                            | *Task13* ~ *Task14*
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layer
+- **One** fully-connected layer
+- **One** CTC-loss layer
+
+<div align="center">
+<img src="image/ds2_network.png" width=350><br/>
+Figure 1. Architecture of Deep Speech 2 Network.
+</div>
+
+We don't have to stick to this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], the authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
+
+Key ingredients about the layers:
+
+- **Data Layers**: 
+   - Frame sequences data of audio **spectrogram** (with FFT).
+   - Token sequences data of **transcription** text (labels). 
+   - These two types of sequences do not have the same lengths, thus a CTC-loss layer is required.
+- **2D Convolution Layers**: 
+   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
+   - With striding for only the first convolution layer.
+   - No pooling for all convolution layers.
+- **Uni-directional RNNs** 
+	- Uni-directional + row convolution: for low-latency inference.
+	- Bi-directional + without row convolution: if we don't care about the inference latency.
+- **Row convolution**:
+	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
+	- Not necessary when using bi-directional RNNs.
+	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
+- **Batch Normalization Layers**:
+   - Added to all above layers (except for data and loss layer).
+   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
+ 
+
+Required Components                     | PaddlePaddle Support                      | Need to Develop
+:-------------------------------------  | :--------------------------------------   | :-----------------------
+Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
+Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
+2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
+DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
+Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
+Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
+CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
+Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
+CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
+
+### Row Convolution
+
+TODO by Assignees
+
+### Beam Search with CTC and LM
+
+<div align="center">
+<img src="image/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation because one prefix may come from different paths;
+   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space is appended in English decoding and whenever any character is appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7).
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
+ 
+
+## Future Work
+
+- Efficiency Improvement
+- Accuracy Improvement
+- Low-latency Inference Library
+- Large-scale benchmarking
+
+## References
+
+1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
+2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
--- a/doc/design/speech/image/beam_search.png
+++ b/doc/design/speech/image/beam_search.png
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -2,9 +2,9 @@

 ## Background

-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.

-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.

 On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.

@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith

 There are mainly three parts that we have to consider while integrating a new device/library:

- Place and DeviceContext: indicates the device id and manages hardware resources
+- Place and DeviceContext: indicate the device id and manage hardware resources

 - Memory and Tensor: malloc/free data on certain device

@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de

 ### Place and DeviceContext

-Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.

 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.

 ```
        |   CPUPlace
@@ -144,7 +144,7 @@ class Tensor {
 };
 ```

-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.

 ```cpp
 paddle::framework::Tensor t;
@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi

 Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:

-The interface is defined in header file.
+The interface is defined in the header file.

 ```
 template <typename DeviceContext, typename T>
@@ -174,7 +174,7 @@ class MaxOutFunctor {
 };
 ```

-CPU implemention is in .cc file
+CPU implementation is in .cc file

 ```
 template <typename T>
@@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> {
 };
 ```

-CUDA implemention is in .cu file
+CUDA implementation is in .cu file

 ```
 template <typename T>
@@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
 ```


-We get computing handle from a concrete DeviceContext, and make compution on tensors.
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.

-The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.

 Fluid provides different register interfaces in op_registry.h

@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(

 ## Advanced topics: How to switch between different Device/Library

-Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.


 For more details, please refer to following docs:

--- a/doc/design/switch.md
+++ b/doc/design/switch.md
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with fluid.switch() as switch:
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case (if there is one) when no case matches.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
--- a/doc/howto/dev/FullyConnected.jpg
+++ b/doc/howto/dev/FullyConnected.jpg
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
--- a/doc/dev/contribute_to_paddle_en.md
+++ b/doc/dev/contribute_to_paddle_en.md
+../../CONTRIBUTING.md
\ No newline at end of file
--- a/doc/dev/index_cn.rst
+++ b/doc/dev/index_cn.rst
+开发标准
+========
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.rst
--- a/doc/dev/index_en.rst
+++ b/doc/dev/index_en.rst
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
+  contribute_to_paddle_en.md
+  write_docs_en.rst
--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
--- a/doc/howto/dev/new_op_kernel_en.md
+++ b/doc/howto/dev/new_op_kernel_en.md
--- a/doc/howto/dev/use_eigen_cn.md
+++ b/doc/howto/dev/use_eigen_cn.md
--- a/doc/howto/dev/use_eigen_en.md
+++ b/doc/howto/dev/use_eigen_en.md
--- a/doc/dev/write_docs_cn.rst
+++ b/doc/dev/write_docs_cn.rst
+#############
+如何贡献文档
+#############
+
+PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
+也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的文件目录 .ppo_workspace/content 下
+
+如何构建文档
+============
+
+PaddlePaddle的文档构建有三种方式。
+
+
+使用PaddlePaddle.org工具
+------------------------
+这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+
+文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle工作目录指定给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
+
+使用Docker构建
+--------------
+
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    sh build_docs.sh
+
+编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
+直接构建
+--------
+
+如果提示正确，可以执行以下命令编译生成文档，即
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    mkdir -p build
+    cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+    make gen_proto_py
+    make paddle_docs paddle_docs_cn
+
+编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
+
+如何书写文档
+============
+
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
+
+如何更新www.paddlepaddle.org
+============================
+
+更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/dev/write_docs_en.rst
+++ b/doc/dev/write_docs_en.rst
+########################
+Contribute Documentation
+########################
+
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+
+How to Build Documentations
+===========================
+
+We recommend using PaddlePaddle.org tool to build documentation
+
+
+Use PaddlePaddle.org tool
+-------------------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+
+The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command.
+Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+
+If you don't wish to use Docker, you can also run the tool through Django. Use the following commands to set it up:
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+If you want to learn more about PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_.
+
+How to write Documentations
+===========================
+
+PaddlePaddle uses `sphinx`_ to compile the documentation. Please check the official sphinx website for more details.
+
+
+How to update www.paddlepaddle.org
+==================================
+
+Please create PRs and submit them to GitHub; see `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ for details.
+The documentation for the PaddlePaddle develop branch is updated automatically once the PR is merged. Users can check the latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_.
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
-从源码编译
-======================
-
-.. _build_step:
-
-编译方法
----------------
-
-PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
-
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
-
-编译PaddlePaddle，需要执行：
-
-.. code-block:: bash
-
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-   # 如果不使用Docker编译环境，执行下面的命令
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
-
-编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
-
-.. code-block:: bash
-
-   pip install build/python/dist/*.whl
-
-如果机器中已经安装过PaddlePaddle，有两种方法：
-
-.. code-block:: bash
-
-   1. 先卸载之前的版本，再重新安装
-   pip uninstall paddlepaddle
-   pip install build/python/dist/*.whl
-
-   2. 直接升级到更新的版本
-   pip install build/python/dist/*.whl -U
-
-.. _run_test:
-
-执行单元测试
----------------
-
-如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
-
-使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
-开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
-
-.. code-block:: bash
-
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-
-如果不使用Docker，可以执行ctest命令即可：
-
-.. code-block:: bash
-
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
-   ctest
-   # 指定执行其中一个单元测试 test_mul_op
-   ctest -R test_mul_op
-
-.. _compile_deps:
-
-编译依赖
----------------
-
-PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
-
-.. csv-table:: PaddlePaddle编译依赖
-   :header: "依赖", "版本", "说明"
-   :widths: 10, 15, 30
-
-   "CMake", ">=3.2", ""
-   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
-   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
-
-
-.. _build_options:
-
-编译选项
----------------
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
-用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
-`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: 编译选项说明
-    :header: "选项", "说明", "默认值"
-    :widths: 1, 7, 2
-
-    "WITH_GPU", "是否支持GPU", "ON"
-    "WITH_C_API", "是否仅编译CAPI", "OFF"
-    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
-    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
-    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
-    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
-    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "ON"
-    "WITH_DOC", "是否编译中英文文档", "OFF"
-    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
-    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
-    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
-
-BLAS
-+++++
-
-PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
-`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
-
-如果关闭MKL，则会使用OpenBLAS作为BLAS库。
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
-使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
-
-PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
-我们推荐使用最新版本的cuDNN。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` ）**后，再指定。**
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
-安装与编译
-==========
-
-.. _install_steps:
-
-安装流程
-++++++++
-
-PaddlePaddle提供pip和Docker的安装方式：
-
-.. toctree::
-   :maxdepth: 1
-
-   pip_install_cn.rst
-   docker_install_cn.rst
-   ../../howto/dev/build_cn.md
-
-编译流程
-++++++++
-
-..  warning::
-
-    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
-
-..  toctree::
-    :maxdepth: 1
-
-    build_from_source_cn.rst
-
-常见问题解答
-++++++++++
-
-`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
-Install and Build
-=================
-
-.. _install_steps:
-
-Install Steps
-++++++++
-
-You can choose either pip or Docker to complete your install:
-
-.. toctree::
-   :maxdepth: 1
-
-   pip_install_en.rst
-   docker_install_en.rst
-   ../../howto/dev/build_en.md
-
-
-Build from Source
-----------------
-
-..  warning::
-
-    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
-
-..  toctree::
-    :maxdepth: 1
-
-    build_from_source_en.md
-
-FAQ
-++++++++++
-
-`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -4,7 +4,7 @@

 PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
 这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前，请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+在使用该文档之前，请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。


 配置网络

--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
--- a/doc/getstarted/quickstart_cn.rst
+++ b/doc/getstarted/quickstart_cn.rst
--- a/doc/getstarted/quickstart_en.rst
+++ b/doc/getstarted/quickstart_en.rst
--- a/doc/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/capi/compile_paddle_lib_cn.md
--- a/doc/howto/usage/capi/images/csr.png
+++ b/doc/howto/usage/capi/images/csr.png
--- a/doc/howto/usage/capi/images/sequence_data.png
+++ b/doc/howto/usage/capi/images/sequence_data.png
--- a/doc/howto/usage/capi/images/workflow_of_CAPI.png
+++ b/doc/howto/usage/capi/images/workflow_of_CAPI.png
--- a/doc/howto/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
+C-API预测库
+==================
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_cn.md
+  organization_of_the_inputs_cn.md
+  workflow_of_capi_cn.md
--- a/doc/howto/usage/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
--- a/doc/howto/cluster/cmd_argument_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
--- a/doc/howto/cluster/cmd_argument_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
--- a/doc/howto/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/cluster/fluid_cluster_train_en.md
--- a/doc/howto/cluster/index_cn.rst
+++ b/doc/howto/cluster/index_cn.rst
+分布式训练
+==========
+
+本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard）: 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）: 每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_cn.md
+  cmd_argument_cn.md
+  multi_cluster/index_cn.rst
--- a/doc/howto/cluster/index_en.rst
+++ b/doc/howto/cluster/index_en.rst
--- a/doc/howto/usage/cluster/fabric_cn.md
+++ b/doc/howto/usage/cluster/fabric_cn.md
--- a/doc/howto/usage/cluster/fabric_en.md
+++ b/doc/howto/usage/cluster/fabric_en.md
--- a/doc/howto/cluster/multi_cluster/index_cn.rst
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
--- a/doc/howto/cluster/multi_cluster/index_en.rst
+++ b/doc/howto/cluster/multi_cluster/index_en.rst
--- a/doc/howto/usage/cluster/k8s_aws_cn.md
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
--- a/doc/howto/usage/cluster/k8s_aws_en.md
+++ b/doc/howto/usage/cluster/k8s_aws_en.md
--- a/doc/howto/usage/cluster/k8s_cn.md
+++ b/doc/howto/usage/cluster/k8s_cn.md
--- a/doc/howto/usage/cluster/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s_distributed_cn.md
--- a/doc/howto/usage/cluster/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s_en.md
--- a/doc/howto/usage/cluster/openmpi_cn.md
+++ b/doc/howto/usage/cluster/openmpi_cn.md
--- a/doc/howto/usage/cluster/openmpi_en.md
+++ b/doc/howto/usage/cluster/openmpi_en.md
--- a/doc/howto/usage/cluster/src/add_security_group.png
+++ b/doc/howto/usage/cluster/src/add_security_group.png
--- a/doc/howto/usage/cluster/src/create_efs.png
+++ b/doc/howto/usage/cluster/src/create_efs.png
--- a/doc/howto/usage/cluster/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
--- a/doc/howto/usage/cluster/src/k8s_data/Dockerfile
+++ b/doc/howto/usage/cluster/src/k8s_data/Dockerfile
--- a/doc/howto/usage/cluster/src/k8s_data/README.md
+++ b/doc/howto/usage/cluster/src/k8s_data/README.md
--- a/doc/howto/usage/cluster/src/k8s_data/get_data.sh
+++ b/doc/howto/usage/cluster/src/k8s_data/get_data.sh
--- a/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
--- a/doc/howto/usage/cluster/src/k8s_train/README.md
+++ b/doc/howto/usage/cluster/src/k8s_train/README.md
--- a/doc/howto/usage/cluster/src/k8s_train/start.sh
+++ b/doc/howto/usage/cluster/src/k8s_train/start.sh
--- a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
+++ b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
--- a/doc/howto/usage/cluster/src/pserver_and_trainer.png
+++ b/doc/howto/usage/cluster/src/pserver_and_trainer.png
--- a/doc/howto/usage/cluster/src/route53_create_recordset.png
+++ b/doc/howto/usage/cluster/src/route53_create_recordset.png
--- a/doc/howto/usage/cluster/src/route53_create_zone.png
+++ b/doc/howto/usage/cluster/src/route53_create_zone.png
--- a/doc/howto/usage/cluster/src/worker_security_group.png
+++ b/doc/howto/usage/cluster/src/worker_security_group.png
--- a/doc/howto/cluster/preparations_cn.md
+++ b/doc/howto/cluster/preparations_cn.md
--- a/doc/howto/cluster/preparations_en.md
+++ b/doc/howto/cluster/preparations_en.md
--- a/doc/howto/usage/cluster/src/Dockerfile
+++ b/doc/howto/usage/cluster/src/Dockerfile
--- a/doc/howto/usage/cluster/src/efs_mount.png
+++ b/doc/howto/usage/cluster/src/efs_mount.png
--- a/doc/howto/usage/cluster/src/managed_policy.png
+++ b/doc/howto/usage/cluster/src/managed_policy.png
--- a/doc/howto/usage/cluster/src/trainer_cn.png
+++ b/doc/howto/usage/cluster/src/trainer_cn.png
--- a/doc/howto/usage/cluster/src/trainer.png
+++ b/doc/howto/usage/cluster/src/trainer.png
--- a/doc/howto/cluster/src/trainer.png
+++ b/doc/howto/cluster/src/trainer.png
--- a/doc/howto/cluster/src/trainer_cn.png
+++ b/doc/howto/cluster/src/trainer_cn.png
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
--- a/doc/howto/usage/cluster/src/word2vec/prepare.py
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
--- a/doc/howto/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
--- a/doc/howto/usage/cmd_parameter/index_en.rst
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
--- a/doc/howto/usage/cmd_parameter/use_case_cn.md
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
--- a/doc/howto/usage/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/howto/dev/write_docs_en.rst
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
--- a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/rnn/index_cn.rst
+++ b/doc/howto/rnn/index_cn.rst
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
--- a/doc/howto/deep_model/rnn/recurrent_group_cn.md
+++ b/doc/howto/deep_model/rnn/recurrent_group_cn.md
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
--- a/doc/howto/deep_model/rnn/src/bi_lstm.jpg
+++ b/doc/howto/deep_model/rnn/src/bi_lstm.jpg
--- a/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
+++ b/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
--- a/doc/howto/deep_model/rnn/src/glossary_rnn.dot
+++ b/doc/howto/deep_model/rnn/src/glossary_rnn.dot
--- a/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
+++ b/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
--- a/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
+++ b/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
--- a/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
+++ b/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
--- a/doc/howto/usage/capi/compile_paddle_lib_cn.md
+++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md
--- a/doc/howto/usage/capi/index_cn.rst
+++ b/doc/howto/usage/capi/index_cn.rst
--- a/doc/howto/usage/capi/workflow_of_capi_cn.md
+++ b/doc/howto/usage/capi/workflow_of_capi_cn.md
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
--- a/doc/howto/usage/cmd_parameter/index_cn.rst
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
--- a/paddle/framework/.clang-format
+++ b/paddle/framework/.clang-format
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
--- a/paddle/fluid/framework/backward.cc
+++ b/paddle/fluid/framework/backward.cc
--- a/paddle/fluid/framework/backward.h
+++ b/paddle/fluid/framework/backward.h
--- a/paddle/fluid/framework/backward_test.cc
+++ b/paddle/fluid/framework/backward_test.cc
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
--- a/paddle/fluid/framework/ddim_test.cc
+++ b/paddle/fluid/framework/ddim_test.cc
--- a/paddle/fluid/framework/details/buffered_channel.h
+++ b/paddle/fluid/framework/details/buffered_channel.h
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
--- a/paddle/fluid/framework/details/unbuffered_channel.h
+++ b/paddle/fluid/framework/details/unbuffered_channel.h
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
--- a/paddle/fluid/framework/dim_test.cu
+++ b/paddle/fluid/framework/dim_test.cu
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
--- a/paddle/fluid/framework/eigen_test.cc
+++ b/paddle/fluid/framework/eigen_test.cc
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
--- a/paddle/framework/init.h
+++ b/paddle/framework/init.h
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
--- a/paddle/fluid/framework/lod_rank_table.cc
+++ b/paddle/fluid/framework/lod_rank_table.cc
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
--- a/paddle/framework/proto_desc.h
+++ b/paddle/framework/proto_desc.h
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/scope_test.cc
+++ b/paddle/fluid/framework/scope_test.cc
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/framework/tensor.md
+++ b/paddle/framework/tensor.md
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.cu
+++ b/paddle/fluid/framework/tensor_util.cu
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
--- a/paddle/framework/variable.md
+++ b/paddle/framework/variable.md
--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/inference/tests/book/test_helper.h
+++ b/paddle/fluid/inference/tests/book/test_helper.h
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
--- a/paddle/platform/.clang-format
+++ b/paddle/platform/.clang-format
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
--- a/paddle/fluid/memory/detail/meta_cache.h
+++ b/paddle/fluid/memory/detail/meta_cache.h
--- a/paddle/fluid/memory/detail/meta_data.cc
+++ b/paddle/fluid/memory/detail/meta_data.cc
--- a/paddle/fluid/memory/detail/meta_data.h
+++ b/paddle/fluid/memory/detail/meta_data.h
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
--- a/paddle/fluid/operators/.clang-format
+++ b/paddle/fluid/operators/.clang-format
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
--- a/paddle/fluid/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
--- a/paddle/fluid/operators/accuracy_op.h
+++ b/paddle/fluid/operators/accuracy_op.h
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
--- a/paddle/fluid/operators/adadelta_op.cu
+++ b/paddle/fluid/operators/adadelta_op.cu
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/adam_op.cu
+++ b/paddle/fluid/operators/adam_op.cu
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
--- a/paddle/fluid/operators/adamax_op.cu
+++ b/paddle/fluid/operators/adamax_op.cu
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
--- a/paddle/fluid/operators/clip_by_norm_op.cu
+++ b/paddle/fluid/operators/clip_by_norm_op.cu
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
--- a/paddle/fluid/operators/clip_op.cu
+++ b/paddle/fluid/operators/clip_op.cu
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
--- a/paddle/fluid/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
--- a/paddle/fluid/operators/compare_op.h
+++ b/paddle/fluid/operators/compare_op.h
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_op.cu.cc
+++ b/paddle/fluid/operators/conv_op.cu.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
--- a/paddle/fluid/operators/conv_shift_op.cu
+++ b/paddle/fluid/operators/conv_shift_op.cu
--- a/paddle/fluid/operators/conv_shift_op.h
+++ b/paddle/fluid/operators/conv_shift_op.h
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
--- a/paddle/fluid/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
--- a/paddle/fluid/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
--- a/paddle/fluid/operators/create_reader_op.cc
+++ b/paddle/fluid/operators/create_reader_op.cc
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
--- a/paddle/fluid/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/decayed_adagrad_op.cu
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
--- a/paddle/operators/detail/CMakeLists.txt
+++ b/paddle/operators/detail/CMakeLists.txt
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/operators/detail/safe_ref.h
+++ b/paddle/operators/detail/safe_ref.h
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
--- a/paddle/operators/detail/simple_block_queue.h
+++ b/paddle/operators/detail/simple_block_queue.h
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
--- a/paddle/fluid/operators/detection_output_op.cc
+++ b/paddle/fluid/operators/detection_output_op.cc
--- a/paddle/fluid/operators/detection_output_op.cu.cc
+++ b/paddle/fluid/operators/detection_output_op.cu.cc
--- a/paddle/fluid/operators/detection_output_op.h
+++ b/paddle/fluid/operators/detection_output_op.h
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
--- a/paddle/fluid/operators/edit_distance_op.h
+++ b/paddle/fluid/operators/edit_distance_op.h
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
--- a/paddle/fluid/operators/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise_max_op.cu
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
--- a/paddle/fluid/operators/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise_min_op.cu
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
--- a/paddle/fluid/operators/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise_pow_op.cu
--- a/paddle/fluid/operators/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise_pow_op.h
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.h
+++ b/paddle/fluid/operators/fill_zeros_like_op.h
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.cu
+++ b/paddle/fluid/operators/ftrl_op.cu
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
--- a/paddle/fluid/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
--- a/paddle/fluid/operators/hinge_loss_op.h
+++ b/paddle/fluid/operators/hinge_loss_op.h
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
--- a/paddle/fluid/operators/huber_loss_op.h
+++ b/paddle/fluid/operators/huber_loss_op.h
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
--- a/paddle/fluid/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
--- a/paddle/operators/images/batch_norm_fork.dot
+++ b/paddle/operators/images/batch_norm_fork.dot
--- a/paddle/operators/images/batch_norm_fork.png
+++ b/paddle/operators/images/batch_norm_fork.png
--- a/paddle/operators/images/batch_norm_op_kernel.png
+++ b/paddle/operators/images/batch_norm_op_kernel.png
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
--- a/paddle/fluid/operators/iou_similarity_op.cu
+++ b/paddle/fluid/operators/iou_similarity_op.cu
--- a/paddle/fluid/operators/iou_similarity_op.h
+++ b/paddle/fluid/operators/iou_similarity_op.h
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
--- a/paddle/fluid/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
--- a/paddle/fluid/operators/l1_norm_op.h
+++ b/paddle/fluid/operators/l1_norm_op.h
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
--- a/paddle/fluid/operators/label_smooth_op.cu
+++ b/paddle/fluid/operators/label_smooth_op.cu
--- a/paddle/fluid/operators/label_smooth_op.h
+++ b/paddle/fluid/operators/label_smooth_op.h
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
--- a/paddle/fluid/operators/linear_chain_crf_op.cu
+++ b/paddle/fluid/operators/linear_chain_crf_op.cu
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
--- a/paddle/fluid/operators/log_loss_op.h
+++ b/paddle/fluid/operators/log_loss_op.h
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
--- a/paddle/fluid/operators/logical_op.cu
+++ b/paddle/fluid/operators/logical_op.cu
--- a/paddle/fluid/operators/logical_op.h
+++ b/paddle/fluid/operators/logical_op.h
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/lrn_op.cu
+++ b/paddle/fluid/operators/lrn_op.cu
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
--- a/paddle/fluid/operators/lstm_op.cu.cc
+++ b/paddle/fluid/operators/lstm_op.cu.cc
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
--- a/paddle/fluid/operators/lstm_unit_op.h
+++ b/paddle/fluid/operators/lstm_unit_op.h
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
--- a/paddle/fluid/operators/lstmp_op.cu
+++ b/paddle/fluid/operators/lstmp_op.cu
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
--- a/paddle/fluid/operators/margin_rank_loss_op.cu
+++ b/paddle/fluid/operators/margin_rank_loss_op.cu
--- a/paddle/fluid/operators/margin_rank_loss_op.h
+++ b/paddle/fluid/operators/margin_rank_loss_op.h
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/context_project.cc
+++ b/paddle/fluid/operators/math/context_project.cc
--- a/paddle/fluid/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
--- a/paddle/fluid/operators/math/cos_sim_functor.cc
+++ b/paddle/fluid/operators/math/cos_sim_functor.cc
--- a/paddle/fluid/operators/math/cos_sim_functor.cu
+++ b/paddle/fluid/operators/math/cos_sim_functor.cu
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
--- a/paddle/operators/math/detail/CMakeLists.txt
+++ b/paddle/operators/math/detail/CMakeLists.txt
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
--- a/paddle/fluid/operators/math/detection_util.h
+++ b/paddle/fluid/operators/math/detection_util.h
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
--- a/paddle/fluid/operators/math/gru_compute.h
+++ b/paddle/fluid/operators/math/gru_compute.h
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
--- a/paddle/fluid/operators/math/im2col.h
+++ b/paddle/fluid/operators/math/im2col.h
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
--- a/paddle/fluid/operators/math/lstm_compute.cu
+++ b/paddle/fluid/operators/math/lstm_compute.cu
--- a/paddle/fluid/operators/math/lstm_compute.h
+++ b/paddle/fluid/operators/math/lstm_compute.h
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
--- a/paddle/fluid/operators/math/matmul.h
+++ b/paddle/fluid/operators/math/matmul.h
--- a/paddle/fluid/operators/math/maxouting.cc
+++ b/paddle/fluid/operators/math/maxouting.cc
--- a/paddle/fluid/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
--- a/paddle/operators/math/sampler.cc
+++ b/paddle/operators/math/sampler.cc
--- a/paddle/operators/math/sampler.h
+++ b/paddle/operators/math/sampler.h
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
--- a/paddle/fluid/operators/math/unpooling.cc
+++ b/paddle/fluid/operators/math/unpooling.cc
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
--- a/paddle/fluid/operators/math/unpooling.h
+++ b/paddle/fluid/operators/math/unpooling.h
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
--- a/paddle/fluid/operators/math/vol2col.h
+++ b/paddle/fluid/operators/math/vol2col.h
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
--- a/paddle/fluid/operators/matmul_op.cu.cc
+++ b/paddle/fluid/operators/matmul_op.cu.cc
--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
--- a/paddle/fluid/operators/maxout_op.cu.cc
+++ b/paddle/fluid/operators/maxout_op.cu.cc
--- a/paddle/fluid/operators/maxout_op.h
+++ b/paddle/fluid/operators/maxout_op.h
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
--- a/paddle/fluid/operators/minus_op.cu
+++ b/paddle/fluid/operators/minus_op.cu
--- a/paddle/fluid/operators/minus_op.h
+++ b/paddle/fluid/operators/minus_op.h
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
--- a/paddle/fluid/operators/multiplex_op.cu
+++ b/paddle/fluid/operators/multiplex_op.cu
--- a/paddle/fluid/operators/multiplex_op.h
+++ b/paddle/fluid/operators/multiplex_op.h
--- a/paddle/operators/nccl/CMakeLists.txt
+++ b/paddle/operators/nccl/CMakeLists.txt
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
--- a/paddle/fluid/operators/net_op.cc
+++ b/paddle/fluid/operators/net_op.cc
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
--- a/paddle/fluid/operators/one_hot_op.h
+++ b/paddle/fluid/operators/one_hot_op.h
--- a/paddle/operators/op_documentation/batch_norm_op.md
+++ b/paddle/operators/op_documentation/batch_norm_op.md
--- a/paddle/operators/op_documentation/name_convention.md
+++ b/paddle/operators/op_documentation/name_convention.md
--- a/paddle/operators/op_documentation/net_op_design.md
+++ b/paddle/operators/op_documentation/net_op_design.md
--- a/paddle/operators/op_documentation/op_markdown_format.md
+++ b/paddle/operators/op_documentation/op_markdown_format.md
--- a/paddle/operators/op_documentation/rnn_design.md
+++ b/paddle/operators/op_documentation/rnn_design.md
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
--- a/paddle/fluid/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/pool_op.cu.cc
+++ b/paddle/fluid/operators/pool_op.cu.cc
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
--- a/paddle/fluid/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
--- a/paddle/fluid/operators/precision_recall_op.h
+++ b/paddle/fluid/operators/precision_recall_op.h
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
--- a/paddle/fluid/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
--- a/paddle/fluid/operators/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/proximal_adagrad_op.cu
--- a/paddle/fluid/operators/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/proximal_adagrad_op.h
--- a/paddle/fluid/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
--- a/paddle/fluid/operators/proximal_gd_op.cu
+++ b/paddle/fluid/operators/proximal_gd_op.cu
--- a/paddle/fluid/operators/proximal_gd_op.h
+++ b/paddle/fluid/operators/proximal_gd_op.h
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
--- a/paddle/fluid/operators/rank_loss_op.cu
+++ b/paddle/fluid/operators/rank_loss_op.cu
--- a/paddle/fluid/operators/rank_loss_op.h
+++ b/paddle/fluid/operators/rank_loss_op.h
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.cu
+++ b/paddle/fluid/operators/reduce_op.cu
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/reshape_op.cu
+++ b/paddle/fluid/operators/reshape_op.cu
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cu
+++ b/paddle/fluid/operators/rmsprop_op.cu
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
--- a/paddle/fluid/operators/row_conv_op.h
+++ b/paddle/fluid/operators/row_conv_op.h
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cu.cc
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_conv_op.cu.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cu.cc
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
--- a/paddle/fluid/operators/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_erase_op.cu
--- a/paddle/fluid/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
--- a/paddle/fluid/operators/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_pool_op.cu
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
--- a/paddle/fluid/operators/sequence_reshape_op.cu
+++ b/paddle/fluid/operators/sequence_reshape_op.cu
--- a/paddle/fluid/operators/sequence_reshape_op.h
+++ b/paddle/fluid/operators/sequence_reshape_op.h
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.cu
+++ b/paddle/fluid/operators/sequence_slice_op.cu
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
--- a/paddle/fluid/operators/sign_op.cu
+++ b/paddle/fluid/operators/sign_op.cu
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
--- a/paddle/fluid/operators/smooth_l1_loss_op.h
+++ b/paddle/fluid/operators/smooth_l1_loss_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
--- a/paddle/fluid/operators/split_selected_rows_op.cu
+++ b/paddle/fluid/operators/split_selected_rows_op.cu
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
--- a/paddle/fluid/operators/spp_op.cu.cc
+++ b/paddle/fluid/operators/spp_op.cu.cc
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
--- a/paddle/fluid/operators/squared_l2_distance_op.h
+++ b/paddle/fluid/operators/squared_l2_distance_op.h
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
--- a/paddle/fluid/operators/squared_l2_norm_op.h
+++ b/paddle/fluid/operators/squared_l2_norm_op.h
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
--- a/paddle/fluid/operators/target_assign_op.cu
+++ b/paddle/fluid/operators/target_assign_op.cu
--- a/paddle/fluid/operators/target_assign_op.h
+++ b/paddle/fluid/operators/target_assign_op.h
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
--- a/paddle/fluid/operators/unpool_op.cu.cc
+++ b/paddle/fluid/operators/unpool_op.cu.cc
--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
--- a/paddle/fluid/operators/warpctc_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_op.cu.cc
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/.clang-format
+++ b/paddle/fluid/platform/.clang-format
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
--- a/paddle/fluid/platform/call_once.h
+++ b/paddle/fluid/platform/call_once.h
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/cudnn_helper_test.cc
+++ b/paddle/fluid/platform/cudnn_helper_test.cc
--- a/paddle/platform/details/device_ptr_cast.h
+++ b/paddle/platform/details/device_ptr_cast.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/curand.cc
+++ b/paddle/fluid/platform/dynload/curand.cc
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/warpctc.cc
+++ b/paddle/fluid/platform/dynload/warpctc.cc
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/enforce.cc
+++ b/paddle/fluid/platform/enforce.cc
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
--- a/paddle/fluid/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
--- a/paddle/platform/mkldnn_helper.h
+++ b/paddle/platform/mkldnn_helper.h
--- a/paddle/fluid/platform/nccl_test.cu
+++ b/paddle/fluid/platform/nccl_test.cu
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/fluid/pybind/.clang-format
+++ b/paddle/fluid/pybind/.clang-format
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
--- a/paddle/fluid/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
--- a/paddle/framework/data_device_transform.cc
+++ b/paddle/framework/data_device_transform.cc
--- a/paddle/framework/data_device_transform.h
+++ b/paddle/framework/data_device_transform.h
--- a/paddle/framework/data_device_transform_test.cu
+++ b/paddle/framework/data_device_transform_test.cu
--- a/paddle/framework/data_layout.h
+++ b/paddle/framework/data_layout.h
--- a/paddle/framework/data_layout_transform.cc
+++ b/paddle/framework/data_layout_transform.cc
--- a/paddle/framework/data_layout_transform.h
+++ b/paddle/framework/data_layout_transform.h
--- a/paddle/framework/data_layout_transform_test.cc
+++ b/paddle/framework/data_layout_transform_test.cc
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
--- a/paddle/framework/data_type_transform.cc
+++ b/paddle/framework/data_type_transform.cc
--- a/paddle/framework/data_type_transform.h
+++ b/paddle/framework/data_type_transform.h
--- a/paddle/framework/data_type_transform_test.cc
+++ b/paddle/framework/data_type_transform_test.cc
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
--- a/paddle/framework/details/cow_ptr.h
+++ b/paddle/framework/details/cow_ptr.h
--- a/paddle/framework/details/cow_ptr_test.cc
+++ b/paddle/framework/details/cow_ptr_test.cc
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/framework/lod_rank_table.h
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
--- a/paddle/framework/lod_tensor_array.h
+++ b/paddle/framework/lod_tensor_array.h
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
--- a/paddle/framework/op_info.cc
+++ b/paddle/framework/op_info.cc
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
--- a/paddle/framework/op_proto_maker.cc
+++ b/paddle/framework/op_proto_maker.cc
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
--- a/paddle/framework/tensor_util.cc
+++ b/paddle/framework/tensor_util.cc
--- a/paddle/framework/tensor_util.cu
+++ b/paddle/framework/tensor_util.cu
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
--- a/paddle/framework/tensor_util_test.cu
+++ b/paddle/framework/tensor_util_test.cu
--- a/paddle/framework/threadpool.cc
+++ b/paddle/framework/threadpool.cc
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
--- a/paddle/framework/variable_test.cc
+++ b/paddle/framework/variable_test.cc
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
--- a/paddle/inference/example.cc
+++ b/paddle/inference/example.cc
--- a/paddle/inference/inference.cc
+++ b/paddle/inference/inference.cc
--- a/paddle/inference/inference.h
+++ b/paddle/inference/inference.h
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
--- a/paddle/memory/.clang-format
+++ b/paddle/memory/.clang-format
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc