Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into seq_error

87d90d2a · wanghaoshuang · 34b4c7d8 · 69643b5e · 87d90d2a · 87d90d2a
1000 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -27,7 +27,6 @@ third_party/
 cmake-build-*

 # generated while compiling
-python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})

 include(system)

-project(paddle CXX C Go)
+project(paddle CXX C)
 message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
@@ -60,6 +60,7 @@ option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
+option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -141,11 +142,11 @@ include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
-include(external/nccl)
 include(external/cares)
 include(external/grpc)

 include(cudnn)              # set cudnn libraries, must before configure
+include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
@@ -174,7 +175,7 @@ set(EXTERNAL_LIBS
 )

 if(WITH_GPU)
-  include(cuda)
+    include(cuda)
 endif(WITH_GPU)

 if(WITH_MKLML)
@@ -201,17 +202,18 @@ endif()
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
+    enable_language(Go)
    add_subdirectory(go)
 endif(WITH_GOLANG)

 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")

-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

 add_subdirectory(paddle)
 if(WITH_PYTHON)
-  add_subdirectory(python)
+    add_subdirectory(python)
 endif()

 if(WITH_DOC)

--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,8 @@ COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
    apt-get install -y \
-    git python-pip python-dev openssh-server bison libnccl-dev \
+    git python-pip python-dev openssh-server bison \
+    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -21,16 +21,6 @@ RUN apt-get update && \
    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
    apt-get clean -y

-# Install Go and glide
-RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-
 # git credential to skip password typing
 RUN git config --global credential.helper store


--- a/LICENSE
+++ b/LICENSE
@@ -188,7 +188,7 @@ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
-#FROM python:2.7.14
 FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-RUN apt-get update && apt-get install -y python
-RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-#       so we must build one with distribute support to install in this image.
+
+# you can get mirror list here:
+# https://launchpad.net/ubuntu/+archivemirrors
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+
 RUN pip install paddlepaddle
+# If the network is slow, you may need to add a proxy here.
+# ENV https_proxy=
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 RUN pip uninstall -y paddlepaddle
+# Unset the proxy if it was set above.
+# ENV https_proxy=""
+
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+
+# tf k8s
+RUN pip install tensorflow==1.4.0
+ADD tf_k8s /usr/bin
+RUN chmod +x /usr/bin/tf_k8s
+ADD vgg16_tf.py /workspace/

 # below lines may change a lot for debugging
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && \
-chmod +x /usr/bin/paddle_k8s
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN chmod +x /usr/bin/paddle_k8s
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -8,23 +8,24 @@
 - cpu MHz		: 2101.000
 - cache size	: 20480 KB

+### Blas settings
+
+Setting environment variable: `MKL_NUM_THREADS=1`.
+
 ### Single Node Single Thread

- PServer Count: 10
- Trainer Count: 20
 - Metrics: samples / sec

 | Batch Size | 32 | 64 | 128 | 256 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
 | PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | - | - | - | - |
+| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |

 ### Different Batch Size

 - PServer Count: 10
 - Trainer Count: 20
- Per trainer CPU Core: 1
 - Metrics: samples / sec

 | Batch Size | 32 | 64 | 128 | 256 |

--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -11,7 +11,7 @@ spec:
        paddle-job: vgg16job
    spec:
      imagePullSecrets:
-        - name: job-registry-secret
+      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer

--- a/benchmark/cluster/vgg16/tf_k8s
+++ b/benchmark/cluster/vgg16/tf_k8s
+#!/bin/bash
+# Report the job's exit status, write a short reason for a few known statuses
+# to /dev/termination-log, and exit this script with the same status.
+#   $1 - exit status of the job that just finished
+check_trainer_ret() {
+  ret=$1
+  local msg=""
+  stdbuf -oL echo "job returned $ret...setting pod return message..."
+  stdbuf -oL echo "==============================="
+
+  # Pick the message first; statuses not listed here leave the log untouched.
+  if [ $ret -eq 136 ] ; then
+    msg="Error Arithmetic Operation(Floating Point Exception)"
+  elif [ $ret -eq 139 ] ; then
+    msg="Segmentation Fault"
+  elif [ $ret -eq 1 ] ; then
+    msg="General Error"
+  elif [ $ret -eq 134 ] ; then
+    msg="Program Abort"
+  fi
+
+  if [ -n "$msg" ] ; then
+    echo "$msg" > /dev/termination-log
+  fi
+  stdbuf -oL echo "termination log wroted..."
+  exit $ret
+}
+
+# Endpoint lists for the parameter servers and the trainers. They are filled in
+# by wait_running_pods and read when the job command line is assembled.
+g_pservers=""
+g_trainers=""
+
+# Query /root/k8s_tools.py for the pods of this job and store their endpoints
+# in g_pservers / g_trainers.
+# Reads JOB_NAME, PSERVERS_NUM, TRAINERS_NUM and PORT from the environment.
+# NOTE(review): k8s_tools.py is not part of this script; "wait_pods_running"
+# presumably blocks until the given number of pods with the label are running,
+# and "fetch_endpoints" presumably returns their addresses joined with PORT.
+wait_running_pods(){
+  pserver_label="tf-job-pserver=${JOB_NAME}"
+  trainer_label="tf-job-trainer=${JOB_NAME}"
+
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
+
+  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
+  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
+}
+
+# Run this pod as a parameter server.
+# Collects the cluster endpoints, asks k8s_tools.py for this pod's id among
+# the pserver pods, and runs ${ENTRY} from ${TRAINER_PACKAGE} with the cluster
+# flags appended. The command's exit status is not inspected here.
+start_tf_pserver(){
+  wait_running_pods
+
+  label="tf-job-pserver=${JOB_NAME}"
+  # The value returned by fetch_id is passed on as --task_index.
+  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+  --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
+
+  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+}
+
+# Run this pod as a trainer (worker).
+# Collects the cluster endpoints, asks k8s_tools.py for this pod's id among
+# the trainer pods, runs ${ENTRY} from ${TRAINER_PACKAGE} with the cluster
+# flags and the batch size appended, then hands the exit status to
+# check_trainer_ret (which exits the script).
+# NOTE(review): only BATCH_SIZE is forwarded to the entry command; a NUM_PASSES
+# environment variable is not, so the entry script's own default number of
+# passes applies -- confirm this is intended.
+start_tf_trainer(){
+  wait_running_pods
+
+  label="tf-job-trainer=${JOB_NAME}"
+  # The value returned by fetch_id is passed on as --task_index.
+  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+  --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
+
+  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+  check_trainer_ret $?
+}
+
+# Dispatch on TF_JOB_NAME: "worker" starts a trainer, any other value starts
+# a parameter server.
+start_tf(){
+    case "${TF_JOB_NAME}" in
+        worker)
+            start_tf_trainer
+            ;;
+        *)
+            start_tf_pserver
+            ;;
+    esac
+}
+
+# Print the command-line help for this script.
+usage() {
+    printf '%s\n' \
+        "usage: tf_k8s [<args>]:" \
+        "  start_tf         Start tensorflow jobs"
+}
+
+# Entry point: "start_tf" launches the job; "--help" and anything else
+# (including no argument) print the usage text.
+case "$1" in
+    start_tf)
+        start_tf
+        ;;
+    --help | *)
+        usage
+        ;;
+esac
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ b/benchmark/cluster/vgg16/tf_pserver.yaml
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-tf-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        tf-job-pserver: vgg16job-tf
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+        imagePullPolicy: Always
+        command: ["tf_k8s", "start_tf"]
+        # Declare the port the job really uses. The pod runs with hostNetwork
+        # and the cluster endpoints are built from the PORT env value below
+        # (32036), so the declared port has to be that same number.
+        ports:
+        - name: jobport-32036
+          containerPort: 32036
+        env:
+        - name: PORT
+          value: "32036"
+        - name: ENTRY
+          value: "python vgg16_tf.py"
+        - name: JOB_NAME
+          value: vgg16job-tf
+        - name: PSERVERS_NUM
+          value: "10"
+        - name: TF_JOB_NAME 
+          value: "ps"
+        - name: TRAINERS_NUM
+          value: "20"
+        - name: BATCH_SIZE
+          value: "128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: NUM_PASSES
+          value: "1"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ b/benchmark/cluster/vgg16/tf_trainer.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-tf-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        tf-job-trainer: vgg16job-tf
+    spec:
+      imagePullSecrets:
+      - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+        imagePullPolicy: Always
+        command: ["tf_k8s", "start_tf"]
+        # Declare the port the job really uses. The pod runs with hostNetwork
+        # and the cluster endpoints are built from the PORT env value below
+        # (32036), so the declared port has to be that same number.
+        ports:
+        - name: jobport-32036
+          containerPort: 32036
+        env:
+        - name: PORT
+          value: "32036"
+        - name: JOB_NAME
+          value: vgg16job-tf
+        - name: TF_JOB_NAME 
+          value: "worker"
+        - name: ENTRY
+          value: "python vgg16_tf.py"
+        - name: PSERVERS_NUM
+          value: "10"
+        - name: BATCH_SIZE
+          value: "128"
+        - name: TRAINERS_NUM
+          value: "20"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: NUM_PASSES
+          value: "1"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -68,6 +68,21 @@ parser.add_argument(
    type=str2bool,
    default=True,
    help='Whether to run as local mode.')
+
+parser.add_argument(
+    "--ps_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--trainer_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+
+# Index of this task within its job; passed to the transpiler as trainer_id.
+parser.add_argument(
+    "--task_index", type=int, default=0, help="Index of task within the job")
 args = parser.parse_args()


@@ -180,8 +195,9 @@ def main():
                    iters += 1
                    num_samples += len(data)
                    print(
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                        % (pass_id, iters, loss, acc, time.time() - ts)
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                        % (pass_id, iters, loss, acc,
+                           len(data) / (time.time() - ts))
                    )  # The accuracy is the accumulation of batches, but not the current batch.

            pass_elapsed = time.time() - start_time
@@ -209,27 +225,24 @@ def main():
            batch_size=args.batch_size)
        train_loop(exe, fluid.default_main_program())
    else:
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, "6174"]))
-        pserver_endpoints = ",".join(eplist)
-        print("pserver endpoints: ", pserver_endpoints)
        trainers = int(os.getenv("TRAINERS"))  # total trainer count
        print("trainers total: ", trainers)
-        current_endpoint = os.getenv(
-            "POD_IP") + ":6174"  # current pserver endpoint
+
        training_role = os.getenv(
            "TRAINING_ROLE",
            "TRAINER")  # get the training role: trainer/pserver
+
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
-            pservers=pserver_endpoints,
+            trainer_id=args.task_index,
+            pservers=args.ps_hosts,
            trainers=trainers)

        if training_role == "PSERVER":
+            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
+                "PADDLE_INIT_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)

--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow
+You can get distribution example template structure here:
+https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
+https://www.tensorflow.org/deploy/distributed
+"""
+
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+# Command-line flags. They are parsed once, at import time, into the
+# module-level `args` that the model and the training loop read directly.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+# NOTE(review): --device is accepted but not referenced anywhere else in this
+# script.
+parser.add_argument(
+    '--device',
+    type=str,
+    default='CPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+# The two help fragments are concatenated, so the first one must end with a
+# space to keep the sentences apart.
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NHWC',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, NCHW=[batch, channels, height, width]. '
+    'Only support NHWC right now.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+
+# Flags describing the cluster and this process's place in it; they are used
+# to build the tf.train.ClusterSpec and the tf.train.Server.
+parser.add_argument(
+    "--ps_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--worker_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--job_name", type=str, default="", help="One of 'worker', 'ps'")
+parser.add_argument(
+    "--task_index", type=int, default=0, help="Index of task within the job")
+
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+    """VGG16-style network with batch normalization, built from TF 1.x ops.
+
+    The layout of the convolution inputs follows the module-level
+    `args.data_format`; see `network` for the current limitation.
+    """
+
+    def __init__(self):
+        # Kept as part of the object's interface; nothing in this class
+        # appends to it.
+        self.parameters = []
+
+    def batch_norm_relu(self, inputs, is_training):
+        """Performs a batch normalization followed by a ReLU.
+
+        Args:
+            inputs: tensor to normalize.
+            is_training: bool tensor selecting training or inference mode.
+
+        Returns:
+            The normalized, rectified tensor.
+        """
+        # We set fused=True for a significant speed boost. See
+        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+        inputs = tf.layers.batch_normalization(
+            inputs=inputs,
+            # Channel axis: 1 for NCHW, last axis otherwise.
+            axis=1 if args.data_format == 'NCHW' else -1,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            training=is_training,
+            fused=True)
+        inputs = tf.nn.relu(inputs)
+        return inputs
+
+    def conv_bn_layer(self,
+                      name,
+                      images,
+                      kernel_shape,
+                      is_training,
+                      drop_rate=0.0):
+        """Stride-1 'SAME' convolution + bias, then batch norm/ReLU and dropout.
+
+        Args:
+            name: name scope for the ops of this layer.
+            images: input tensor.
+            kernel_shape: shape of the convolution kernel; its last entry is
+                also the size of the bias vector.
+            is_training: bool tensor selecting training or inference mode.
+            drop_rate: dropout rate applied after the activation.
+
+        Returns:
+            The output tensor of the layer.
+        """
+        with tf.name_scope(name):
+            kernel = tf.Variable(
+                tf.truncated_normal(
+                    kernel_shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            conv = tf.nn.conv2d(
+                images,
+                kernel, [1, 1, 1, 1],
+                data_format=args.data_format,
+                padding='SAME')
+            biases = tf.Variable(
+                tf.constant(
+                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(conv, biases)
+            out = self.batch_norm_relu(out, is_training)
+            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+            return out
+
+    def fc_layer(self, name, inputs, shape):
+        """Fully connected layer without activation.
+
+        Args:
+            name: name scope for the ops of this layer.
+            inputs: 2-D input tensor.
+            shape: [input_size, output_size] of the weight matrix.
+
+        Returns:
+            inputs * weights + biases.
+        """
+        with tf.name_scope(name):
+            fc_w = tf.Variable(
+                tf.truncated_normal(
+                    shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            fc_b = tf.Variable(
+                tf.constant(
+                    0.0, shape=[shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+            return out
+
+    def network(self, images, class_dim, is_training):
+        """ VGG16 model structure.
+
+            Five groups of conv_bn_layer blocks, each followed by a 2x2 max
+            pool, then three fully connected layers.
+
+            The max-pool calls do not pass a data format, so only the 'NHWC'
+            layout is handled here.
+
+            TODO(kuke): enable this network to support the 'NCHW' data format
+
+            Args:
+                images: input image batch.
+                class_dim: number of output classes.
+                is_training: bool tensor selecting training or inference mode.
+
+            Returns:
+                The logits tensor produced by the last fully connected layer.
+        """
+
+        # conv1
+        conv1_1 = self.conv_bn_layer(
+            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+        conv1_2 = self.conv_bn_layer(
+            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+        # pool1
+        pool1 = tf.nn.max_pool(
+            conv1_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool1')
+        # conv2
+        conv2_1 = self.conv_bn_layer(
+            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+        conv2_2 = self.conv_bn_layer(
+            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+        # pool2
+        pool2 = tf.nn.max_pool(
+            conv2_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool2')
+        # conv3
+        conv3_1 = self.conv_bn_layer(
+            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+        conv3_2 = self.conv_bn_layer(
+            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+        conv3_3 = self.conv_bn_layer(
+            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+        # pool3
+        pool3 = tf.nn.max_pool(
+            conv3_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool3')
+        # conv4
+        conv4_1 = self.conv_bn_layer(
+            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+        conv4_2 = self.conv_bn_layer(
+            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv4_3 = self.conv_bn_layer(
+            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool4
+        pool4 = tf.nn.max_pool(
+            conv4_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # conv5
+        conv5_1 = self.conv_bn_layer(
+            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_2 = self.conv_bn_layer(
+            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_3 = self.conv_bn_layer(
+            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool5 (give it its own name instead of reusing 'pool4')
+        pool5 = tf.nn.max_pool(
+            conv5_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool5')
+        # flatten
+        shape = int(np.prod(pool5.get_shape()[1:]))
+        pool5_flat = tf.reshape(pool5, [-1, shape])
+        # fc1
+        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+        fc1 = self.fc_layer('fc1', drop, [shape, 512])
+        # fc2
+        bn = self.batch_norm_relu(fc1, is_training)
+        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+        fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+        return fc3
+
+
+def run_benchmark(cluster_spec, server):
+    """Run benchmark on cifar10 or flowers.
+
+    Builds the VGG16 graph under a replica device setter for this worker,
+    then, for every pass, trains over the whole training set (printing the
+    throughput of each iteration) and evaluates accuracy on the test set.
+
+    Args:
+        cluster_spec: tf.train.ClusterSpec describing the ps/worker hosts.
+        server: tf.train.Server of the local task; its target is used as the
+            session master.
+    """
+
+    if args.data_set == "cifar10":
+        class_dim = 10
+        raw_shape = (3, 32, 32)
+        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+            None, 3, 32, 32)
+    else:
+        class_dim = 102
+        raw_shape = (3, 224, 224)
+        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+            None, 3, 224, 224)
+
+    device = tf.train.replica_device_setter(
+        worker_device="/job:worker/task:{}".format(args.task_index),
+        cluster=cluster_spec)
+
+    with tf.device(device):
+        images = tf.placeholder(tf.float32, shape=dat_shape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        vgg16 = VGG16Model()
+        logits = vgg16.network(images, class_dim, is_training)
+        loss = tf.losses.softmax_cross_entropy(
+            onehot_labels=onehot_labels, logits=logits)
+        avg_loss = tf.reduce_mean(loss)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        global_step = tf.Variable(0, name='global_step', trainable=False)
+        # Make the train step depend on the collected update ops (e.g. the
+        # batch-norm moving averages) so they run with every step.
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_loss, global_step=global_step)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+
+    def to_feed(batch):
+        """Turn one reader batch into (images, labels) numpy arrays.
+
+        Every sample is reshaped to raw_shape (channels first) and, for
+        NHWC, transposed to channels last so it matches the placeholder.
+        """
+        to_nhwc = args.data_format == 'NHWC'
+        # List comprehensions rather than map(): np.array() over a lazy
+        # map object does not produce the batch array.
+        batch_images = np.array([
+            np.transpose(sample[0].reshape(raw_shape), axes=[1, 2, 0])
+            if to_nhwc else sample[0].reshape(raw_shape) for sample in batch
+        ]).astype("float32")
+        batch_labels = np.array([sample[1] for sample in batch]).astype('int64')
+        return batch_images, batch_labels
+
+    def test(sess):
+        """Return the mean of the per-batch accuracies over the test set."""
+        test_accs = []
+        for batch in test_reader():
+            test_images, test_labels = to_feed(batch)
+            # Evaluate through the session explicitly: the monitored session
+            # is not registered as the default session, so Tensor.eval()
+            # without a session argument would raise.
+            test_accs.append(
+                sess.run(accuracy,
+                         feed_dict={
+                             images: test_images,
+                             labels: test_labels,
+                             is_training: False
+                         }))
+        return np.mean(test_accs)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
+
+    with tf.train.MonitoredTrainingSession(
+            master=server.target, is_chief=(args.task_index == 0),
+            hooks=hooks) as sess:
+        iters, num_samples, start_time = 0, 0, 0.0
+        for pass_id in range(args.num_passes):
+            # train
+            num_samples = 0
+            start_time = time.time()
+            for data in train_reader():
+                train_images, train_labels = to_feed(data)
+                # Time only the session run, not the batch conversion.
+                iter_begin_time = time.time()
+                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+                                        feed_dict={
+                                            images: train_images,
+                                            labels: train_labels,
+                                            is_training: True
+                                        })
+                iters += 1
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
+                    % (pass_id, iters, loss, acc,
+                       len(data) / (time.time() - iter_begin_time)))
+                num_samples += len(data)
+            train_elapsed = time.time() - start_time
+            # test
+            pass_test_acc = test(sess)
+            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+                  (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+    """Print every parsed command-line argument, sorted by name."""
+    print('-----------  Configuration Arguments -----------')
+    # items() exists on every Python version, unlike iteritems(); sorting
+    # makes the output identical either way.
+    for arg, value in sorted(vars(args).items()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    print_arguments()
+
+    # One comma-separated host list per job.
+    ps_hosts = args.ps_hosts.split(",")
+    worker_hosts = args.worker_hosts.split(",")
+    cluster_layout = {"ps": ps_hosts, "worker": worker_hosts}
+
+    # Describe the cluster and start the server for the local task.
+    cluster_spec = tf.train.ClusterSpec(cluster_layout)
+    server = tf.train.Server(
+        cluster_spec, job_name=args.job_name, task_index=args.task_index)
+
+    # Workers run the benchmark; parameter servers join the server and keep
+    # serving. Any other job name does nothing further.
+    if args.job_name == "worker":
+        print("start worker")
+        run_benchmark(cluster_spec, server)
+    elif args.job_name == "ps":
+        print("start pserver")
+        server.join()
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.

--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/paddle/rnn/imdb.py
+++ b/benchmark/paddle/rnn/imdb.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/paddle/rnn/provider.py
+++ b/benchmark/paddle/rnn/provider.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/image/alexnet.py
+++ b/benchmark/tensorflow/image/alexnet.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/image/googlenet.py
+++ b/benchmark/tensorflow/image/googlenet.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/benchmark/tensorflow/rnn/reader.py
+++ b/benchmark/tensorflow/rnn/reader.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)

 if(NOT WITH_GPU)
    add_definitions(-DHPPL_STUB_FUNC)
+    add_definitions("-DCUPTI_LIB_PATH=\"\"")

    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@@ -73,7 +74,14 @@ else()
    if(NOT CUDNN_FOUND)
        message(FATAL_ERROR "Paddle needs cudnn to compile")
    endif()
-
+    if(CUPTI_FOUND)
+        include_directories(${CUPTI_INCLUDE_DIR})
+        add_definitions(-DPADDLE_WITH_CUPTI)
+        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
+    else()
+        add_definitions("-DCUPTI_LIB_PATH=\"\"")
+        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
+    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

    # Include cuda and cudnn

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/cross_compiling/host.cmake
+++ b/cmake/cross_compiling/host.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/cross_compiling/raspberry_pi.cmake
+++ b/cmake/cross_compiling/raspberry_pi.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -155,7 +155,8 @@ endif()
 include_directories(${CUDA_INCLUDE_DIRS})
 list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+    # TODO(panyx0718): CUPTI only allows DSO?
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
 endif(NOT WITH_DSO)

 # setting nvcc arch flags

--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
+# Locate the CUPTI (CUDA Profiling Tools Interface) header and shared library.
+#
+# Inputs (read if set):
+#   WITH_GPU                - the module is a no-op unless this is true
+#   CUPTI_ROOT              - cache path hint, defaults to /usr
+#   $ENV{CUPTI_ROOT}        - environment hint
+#   CUDA_TOOLKIT_ROOT_DIR   - used for <root>/extras/CUPTI/{include,lib64}
+#   CUDA_CUDART_LIBRARY     - its directory is searched for the library too
+#
+# Outputs:
+#   CUPTI_INCLUDE_DIR   - directory containing cupti.h
+#   CUPTI_LIBRARY       - full path to libcupti
+#   CUPTI_LIBRARY_PATH  - directory part of CUPTI_LIBRARY
+#   CUPTI_FOUND         - ON when both header and library were found
+if(NOT WITH_GPU)
+    return()
+endif()
+
+
+set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
+find_path(CUPTI_INCLUDE_DIR cupti.h
+        PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
+        $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
+        NO_DEFAULT_PATH
+        )
+
+# Directory that holds the CUDA runtime; quoted so an empty value does not
+# change the number of arguments passed to get_filename_component().
+get_filename_component(__libpath_hist "${CUDA_CUDART_LIBRARY}" DIRECTORY)
+
+# Default to x86_64 and override with the detected processor when available.
+# The variable is tested by name: if() dereferences it itself, whereas
+# expanding it first would test a variable *named* after the processor.
+set(TARGET_ARCH "x86_64")
+if(CMAKE_SYSTEM_PROCESSOR)
+    set(TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+list(APPEND CUPTI_CHECK_LIBRARY_DIRS
+        ${CUPTI_ROOT}
+        ${CUPTI_ROOT}/lib64
+        ${CUPTI_ROOT}/lib
+        ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+        $ENV{CUPTI_ROOT}
+        $ENV{CUPTI_ROOT}/lib64
+        $ENV{CUPTI_ROOT}/lib
+        /usr/lib
+        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+# Only the shared library is searched for; the static archive is not used.
+find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
+       PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+       NO_DEFAULT_PATH
+       DOC "Path to cuPTI library.")
+
+# When the library was not found this yields an empty directory; consumers
+# should check CUPTI_FOUND before relying on CUPTI_LIBRARY_PATH.
+get_filename_component(CUPTI_LIBRARY_PATH "${CUPTI_LIBRARY}" DIRECTORY)
+if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
+    set(CUPTI_FOUND ON)
+else()
+    set(CUPTI_FOUND OFF)
+endif()
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,9 +15,16 @@
 include(ExternalProject)

 set(BOOST_PROJECT       "extern_boost")
+# To release PaddlePaddle as a pip package, we have to follow the
+# manylinux1 standard, which targets Linux kernels and compilers that
+# are as old as possible and recommends CentOS 5. However, the earliest
+# CentOS version that works with NVIDIA CUDA is CentOS 6, and a newer
+# version of boost, say, 1.66.0, doesn't build on CentOS 6.  We
+# checked that the devtools package of CentOS 6 installs boost 1.41.0,
+# so we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
+set(BOOST_URL           "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -56,6 +56,7 @@ ExternalProject_Add(
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} 
    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -16,12 +16,10 @@ function(copy TARGET)
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
        list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
-        if(IS_DIRECTORY ${src})
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
-        else()
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
-        endif()
+        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+          COMMAND mkdir -p "${dst}"
+          COMMAND cp -r "${src}" "${dst}"
+          COMMENT "copying ${src} -> ${dst}")
    endforeach()
 endfunction()

@@ -53,11 +51,11 @@ IF(NOT PROTOBUF_FOUND)
 ENDIF(NOT PROTOBUF_FOUND)

 # paddle fluid module
-set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )

@@ -69,7 +67,7 @@ copy(memory_lib

 set(module "inference")
 copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )


--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/cmake/system.cmake
+++ b/cmake/system.cmake
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -8,7 +8,7 @@ data_feeder
 DataFeeder
 ----------

-..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+..  autoclass:: paddle.fluid.data_feeder.DataFeeder
    :members:
    :noindex:

--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -8,14 +8,14 @@ evaluator
 Accuracy
 --------

-..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+..  autoclass:: paddle.fluid.evaluator.Accuracy
    :members:
    :noindex:

 ChunkEvaluator
 --------------

-..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
    :members:
    :noindex:

--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@@ -8,25 +8,25 @@ executor
 Executor
 --------

-..  autoclass:: paddle.v2.fluid.executor.Executor
+..  autoclass:: paddle.fluid.executor.Executor
    :members:
    :noindex:

 global_scope
 ------------

-..  autofunction:: paddle.v2.fluid.executor.global_scope
+..  autofunction:: paddle.fluid.executor.global_scope
    :noindex:

 scope_guard
 -----------

-..  autofunction:: paddle.v2.fluid.executor.scope_guard
+..  autofunction:: paddle.fluid.executor.scope_guard
    :noindex:

 switch_scope
 ------------

-..  autofunction:: paddle.v2.fluid.executor.switch_scope
+..  autofunction:: paddle.fluid.executor.switch_scope
    :noindex:

--- a/doc/api/v2/fluid/gen_doc.py
+++ b/doc/api/v2/fluid/gen_doc.py
@@ -17,7 +17,7 @@ import argparse
 import sys
 import types

-import paddle.v2.fluid as fluid
+import paddle.fluid as fluid


 def parse_arg():
@@ -70,7 +70,7 @@ class DocGenerator(object):

    def print_class(self, name):
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+        self.stream.write('''..  autoclass:: paddle.fluid.{0}.{1}
    :members:
    :noindex:

@@ -78,7 +78,7 @@ class DocGenerator(object):

    def print_method(self, name):
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+        self.stream.write('''..  autofunction:: paddle.fluid.{0}.{1}
    :noindex:

 '''.format(self.module_name, name))

--- a/doc/api/v2/fluid/gen_doc.sh
+++ b/doc/api/v2/fluid/gen_doc.sh
--- a/doc/api/fluid/index.rst
+++ b/doc/api/fluid/index.rst
+======================
+Fluid
+======================
+
+..  toctree::
+    :maxdepth: 1
+
+    layers.rst
+    data_feeder.rst
+    executor.rst
+    initializer.rst
+    evaluator.rst
+    nets.rst
+    optimizer.rst
+    param_attr.rst
+    profiler.rst
+    regularizer.rst
+    io.rst
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
@@ -8,28 +8,28 @@ initializer
 Constant
 --------

-..  autoclass:: paddle.v2.fluid.initializer.Constant
+..  autoclass:: paddle.fluid.initializer.Constant
    :members:
    :noindex:

 Uniform
 -------

-..  autoclass:: paddle.v2.fluid.initializer.Uniform
+..  autoclass:: paddle.fluid.initializer.Uniform
    :members:
    :noindex:

 Normal
 ------

-..  autoclass:: paddle.v2.fluid.initializer.Normal
+..  autoclass:: paddle.fluid.initializer.Normal
    :members:
    :noindex:

 Xavier
 ------

-..  autoclass:: paddle.v2.fluid.initializer.Xavier
+..  autoclass:: paddle.fluid.initializer.Xavier
    :members:
    :noindex:

--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
@@ -8,54 +8,54 @@ io
 save_vars
 ---------

-..  autofunction:: paddle.v2.fluid.io.save_vars
+..  autofunction:: paddle.fluid.io.save_vars
    :noindex:

 save_params
 -----------

-..  autofunction:: paddle.v2.fluid.io.save_params
+..  autofunction:: paddle.fluid.io.save_params
    :noindex:

 save_persistables
 -----------------

-..  autofunction:: paddle.v2.fluid.io.save_persistables
+..  autofunction:: paddle.fluid.io.save_persistables
    :noindex:

 load_vars
 ---------

-..  autofunction:: paddle.v2.fluid.io.load_vars
+..  autofunction:: paddle.fluid.io.load_vars
    :noindex:

 load_params
 -----------

-..  autofunction:: paddle.v2.fluid.io.load_params
+..  autofunction:: paddle.fluid.io.load_params
    :noindex:

 load_persistables
 -----------------

-..  autofunction:: paddle.v2.fluid.io.load_persistables
+..  autofunction:: paddle.fluid.io.load_persistables
    :noindex:

 save_inference_model
 --------------------

-..  autofunction:: paddle.v2.fluid.io.save_inference_model
+..  autofunction:: paddle.fluid.io.save_inference_model
    :noindex:

 load_inference_model
 --------------------

-..  autofunction:: paddle.v2.fluid.io.load_inference_model
+..  autofunction:: paddle.fluid.io.load_inference_model
    :noindex:

 get_inference_program
 ---------------------

-..  autofunction:: paddle.v2.fluid.io.get_inference_program
+..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:

--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -11,167 +11,167 @@ control_flow
 split_lod_tensor
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+..  autofunction:: paddle.fluid.layers.split_lod_tensor
    :noindex:

 merge_lod_tensor
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+..  autofunction:: paddle.fluid.layers.merge_lod_tensor
    :noindex:

 BlockGuard
 ----------

-..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+..  autoclass:: paddle.fluid.layers.BlockGuard
    :members:
    :noindex:

 BlockGuardWithCompletion
 ------------------------

-..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+..  autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
    :members:
    :noindex:

 StaticRNNMemoryLink
 -------------------

-..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+..  autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
    :members:
    :noindex:

 WhileGuard
 ----------

-..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+..  autoclass:: paddle.fluid.layers.WhileGuard
    :members:
    :noindex:

 While
 -----

-..  autoclass:: paddle.v2.fluid.layers.While
+..  autoclass:: paddle.fluid.layers.While
    :members:
    :noindex:

 lod_rank_table
 --------------

-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+..  autofunction:: paddle.fluid.layers.lod_rank_table
    :noindex:

 max_sequence_len
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+..  autofunction:: paddle.fluid.layers.max_sequence_len
    :noindex:

 topk
 ----

-..  autofunction:: paddle.v2.fluid.layers.topk
+..  autofunction:: paddle.fluid.layers.topk
    :noindex:

 lod_tensor_to_array
 -------------------

-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
    :noindex:

 array_to_lod_tensor
 -------------------

-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
    :noindex:

 increment
 ---------

-..  autofunction:: paddle.v2.fluid.layers.increment
+..  autofunction:: paddle.fluid.layers.increment
    :noindex:

 array_write
 -----------

-..  autofunction:: paddle.v2.fluid.layers.array_write
+..  autofunction:: paddle.fluid.layers.array_write
    :noindex:

 create_array
 ------------

-..  autofunction:: paddle.v2.fluid.layers.create_array
+..  autofunction:: paddle.fluid.layers.create_array
    :noindex:

 less_than
 ---------

-..  autofunction:: paddle.v2.fluid.layers.less_than
+..  autofunction:: paddle.fluid.layers.less_than
    :noindex:

 array_read
 ----------

-..  autofunction:: paddle.v2.fluid.layers.array_read
+..  autofunction:: paddle.fluid.layers.array_read
    :noindex:

 shrink_memory
 -------------

-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+..  autofunction:: paddle.fluid.layers.shrink_memory
    :noindex:

 array_length
 ------------

-..  autofunction:: paddle.v2.fluid.layers.array_length
+..  autofunction:: paddle.fluid.layers.array_length
    :noindex:

 IfElse
 ------

-..  autoclass:: paddle.v2.fluid.layers.IfElse
+..  autoclass:: paddle.fluid.layers.IfElse
    :members:
    :noindex:

 DynamicRNN
 ----------

-..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+..  autoclass:: paddle.fluid.layers.DynamicRNN
    :members:
    :noindex:

 ConditionalBlock
 ----------------

-..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+..  autoclass:: paddle.fluid.layers.ConditionalBlock
    :members:
    :noindex:

 StaticRNN
 ---------

-..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+..  autoclass:: paddle.fluid.layers.StaticRNN
    :members:
    :noindex:

 reorder_lod_tensor_by_rank
 --------------------------

-..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
+..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
    :noindex:

 ParallelDo
 ----------

-..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+..  autoclass:: paddle.fluid.layers.ParallelDo
    :members:
    :noindex:

 Print
 -----

-..  autofunction:: paddle.v2.fluid.layers.Print
+..  autofunction:: paddle.fluid.layers.Print
    :noindex:

 device
@@ -180,7 +180,7 @@ device
 get_places
 ----------

-..  autofunction:: paddle.v2.fluid.layers.get_places
+..  autofunction:: paddle.fluid.layers.get_places
    :noindex:

 io
@@ -189,27 +189,27 @@ io
 data
 ----

-..  autofunction:: paddle.v2.fluid.layers.data
+..  autofunction:: paddle.fluid.layers.data
    :noindex:

 BlockGuardServ
 --------------

-..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+..  autoclass:: paddle.fluid.layers.BlockGuardServ
    :members:
    :noindex:

 ListenAndServ
 -------------

-..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+..  autoclass:: paddle.fluid.layers.ListenAndServ
    :members:
    :noindex:

 Send
 ----

-..  autofunction:: paddle.v2.fluid.layers.Send
+..  autofunction:: paddle.fluid.layers.Send
    :noindex:

 nn
@@ -218,259 +218,259 @@ nn
 fc
 --

-..  autofunction:: paddle.v2.fluid.layers.fc
+..  autofunction:: paddle.fluid.layers.fc
    :noindex:

 embedding
 ---------

-..  autofunction:: paddle.v2.fluid.layers.embedding
+..  autofunction:: paddle.fluid.layers.embedding
    :noindex:

 dynamic_lstm
 ------------

-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+..  autofunction:: paddle.fluid.layers.dynamic_lstm
    :noindex:

 dynamic_lstmp
 -------------

-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+..  autofunction:: paddle.fluid.layers.dynamic_lstmp
    :noindex:

 dynamic_gru
 -----------

-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+..  autofunction:: paddle.fluid.layers.dynamic_gru
    :noindex:

 gru_unit
 --------

-..  autofunction:: paddle.v2.fluid.layers.gru_unit
+..  autofunction:: paddle.fluid.layers.gru_unit
    :noindex:

 linear_chain_crf
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+..  autofunction:: paddle.fluid.layers.linear_chain_crf
    :noindex:

 crf_decoding
 ------------

-..  autofunction:: paddle.v2.fluid.layers.crf_decoding
+..  autofunction:: paddle.fluid.layers.crf_decoding
    :noindex:

 cos_sim
 -------

-..  autofunction:: paddle.v2.fluid.layers.cos_sim
+..  autofunction:: paddle.fluid.layers.cos_sim
    :noindex:

 cross_entropy
 -------------

-..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+..  autofunction:: paddle.fluid.layers.cross_entropy
    :noindex:

 square_error_cost
 -----------------

-..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+..  autofunction:: paddle.fluid.layers.square_error_cost
    :noindex:

 accuracy
 --------

-..  autofunction:: paddle.v2.fluid.layers.accuracy
+..  autofunction:: paddle.fluid.layers.accuracy
    :noindex:

 chunk_eval
 ----------

-..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+..  autofunction:: paddle.fluid.layers.chunk_eval
    :noindex:

 sequence_conv
 -------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+..  autofunction:: paddle.fluid.layers.sequence_conv
    :noindex:

 conv2d
 ------

-..  autofunction:: paddle.v2.fluid.layers.conv2d
+..  autofunction:: paddle.fluid.layers.conv2d
    :noindex:

 sequence_pool
 -------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+..  autofunction:: paddle.fluid.layers.sequence_pool
    :noindex:

 pool2d
 ------

-..  autofunction:: paddle.v2.fluid.layers.pool2d
+..  autofunction:: paddle.fluid.layers.pool2d
    :noindex:

 batch_norm
 ----------

-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+..  autofunction:: paddle.fluid.layers.batch_norm
    :noindex:

 layer_norm
 ----------

-..  autofunction:: paddle.v2.fluid.layers.layer_norm
+..  autofunction:: paddle.fluid.layers.layer_norm
    :noindex:

 beam_search_decode
 ------------------

-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+..  autofunction:: paddle.fluid.layers.beam_search_decode
    :noindex:

 conv2d_transpose
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+..  autofunction:: paddle.fluid.layers.conv2d_transpose
    :noindex:

 sequence_expand
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+..  autofunction:: paddle.fluid.layers.sequence_expand
    :noindex:

 lstm_unit
 ---------

-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+..  autofunction:: paddle.fluid.layers.lstm_unit
    :noindex:

 reduce_sum
 ----------

-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+..  autofunction:: paddle.fluid.layers.reduce_sum
    :noindex:

 reduce_mean
 -----------

-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+..  autofunction:: paddle.fluid.layers.reduce_mean
    :noindex:

 reduce_max
 ----------

-..  autofunction:: paddle.v2.fluid.layers.reduce_max
+..  autofunction:: paddle.fluid.layers.reduce_max
    :noindex:

 reduce_min
 ----------

-..  autofunction:: paddle.v2.fluid.layers.reduce_min
+..  autofunction:: paddle.fluid.layers.reduce_min
    :noindex:

 sequence_first_step
 -------------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+..  autofunction:: paddle.fluid.layers.sequence_first_step
    :noindex:

 sequence_last_step
 ------------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+..  autofunction:: paddle.fluid.layers.sequence_last_step
    :noindex:

 dropout
 -------

-..  autofunction:: paddle.v2.fluid.layers.dropout
+..  autofunction:: paddle.fluid.layers.dropout
    :noindex:

 split
 -----

-..  autofunction:: paddle.v2.fluid.layers.split
+..  autofunction:: paddle.fluid.layers.split
    :noindex:

 ctc_greedy_decoder
 ------------------

-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
    :noindex:

 edit_distance
 -------------

-..  autofunction:: paddle.v2.fluid.layers.edit_distance
+..  autofunction:: paddle.fluid.layers.edit_distance
    :noindex:

 l2_normalize
 ------------

-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+..  autofunction:: paddle.fluid.layers.l2_normalize
    :noindex:

 matmul
 ------

-..  autofunction:: paddle.v2.fluid.layers.matmul
+..  autofunction:: paddle.fluid.layers.matmul
    :noindex:

 warpctc
 -------

-..  autofunction:: paddle.v2.fluid.layers.warpctc
+..  autofunction:: paddle.fluid.layers.warpctc
    :noindex:

 sequence_reshape
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+..  autofunction:: paddle.fluid.layers.sequence_reshape
    :noindex:

 transpose
 ---------

-..  autofunction:: paddle.v2.fluid.layers.transpose
+..  autofunction:: paddle.fluid.layers.transpose
    :noindex:

 im2sequence
 -----------

-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+..  autofunction:: paddle.fluid.layers.im2sequence
    :noindex:

 nce
 ---

-..  autofunction:: paddle.v2.fluid.layers.nce
+..  autofunction:: paddle.fluid.layers.nce
    :noindex:

 beam_search
 -----------

-..  autofunction:: paddle.v2.fluid.layers.beam_search
+..  autofunction:: paddle.fluid.layers.beam_search
    :noindex:

 row_conv
 --------

-..  autofunction:: paddle.v2.fluid.layers.row_conv
+..  autofunction:: paddle.fluid.layers.row_conv
    :noindex:

 multiplex
 ---------

-..  autofunction:: paddle.v2.fluid.layers.multiplex
+..  autofunction:: paddle.fluid.layers.multiplex
    :noindex:

 ops
@@ -479,259 +479,259 @@ ops
 mean
 ----

-..  autofunction:: paddle.v2.fluid.layers.mean
+..  autofunction:: paddle.fluid.layers.mean
    :noindex:

 mul
 ---

-..  autofunction:: paddle.v2.fluid.layers.mul
+..  autofunction:: paddle.fluid.layers.mul
    :noindex:

 reshape
 -------

-..  autofunction:: paddle.v2.fluid.layers.reshape
+..  autofunction:: paddle.fluid.layers.reshape
    :noindex:

 scale
 -----

-..  autofunction:: paddle.v2.fluid.layers.scale
+..  autofunction:: paddle.fluid.layers.scale
    :noindex:

 sigmoid_cross_entropy_with_logits
 ---------------------------------

-..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
    :noindex:

 elementwise_add
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+..  autofunction:: paddle.fluid.layers.elementwise_add
    :noindex:

 elementwise_div
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+..  autofunction:: paddle.fluid.layers.elementwise_div
    :noindex:

 elementwise_sub
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+..  autofunction:: paddle.fluid.layers.elementwise_sub
    :noindex:

 elementwise_mul
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+..  autofunction:: paddle.fluid.layers.elementwise_mul
    :noindex:

 elementwise_max
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_max
+..  autofunction:: paddle.fluid.layers.elementwise_max
    :noindex:

 elementwise_min
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_min
+..  autofunction:: paddle.fluid.layers.elementwise_min
    :noindex:

 elementwise_pow
 ---------------

-..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
+..  autofunction:: paddle.fluid.layers.elementwise_pow
    :noindex:

 clip
 ----

-..  autofunction:: paddle.v2.fluid.layers.clip
+..  autofunction:: paddle.fluid.layers.clip
    :noindex:

 clip_by_norm
 ------------

-..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
+..  autofunction:: paddle.fluid.layers.clip_by_norm
    :noindex:

 sequence_softmax
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+..  autofunction:: paddle.fluid.layers.sequence_softmax
    :noindex:

 sigmoid
 -------

-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+..  autofunction:: paddle.fluid.layers.sigmoid
    :noindex:

 logsigmoid
 ----------

-..  autofunction:: paddle.v2.fluid.layers.logsigmoid
+..  autofunction:: paddle.fluid.layers.logsigmoid
    :noindex:

 exp
 ---

-..  autofunction:: paddle.v2.fluid.layers.exp
+..  autofunction:: paddle.fluid.layers.exp
    :noindex:

 relu
 ----

-..  autofunction:: paddle.v2.fluid.layers.relu
+..  autofunction:: paddle.fluid.layers.relu
    :noindex:

 tanh
 ----

-..  autofunction:: paddle.v2.fluid.layers.tanh
+..  autofunction:: paddle.fluid.layers.tanh
    :noindex:

 tanh_shrink
 -----------

-..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
+..  autofunction:: paddle.fluid.layers.tanh_shrink
    :noindex:

 softshrink
 ----------

-..  autofunction:: paddle.v2.fluid.layers.softshrink
+..  autofunction:: paddle.fluid.layers.softshrink
    :noindex:

 sqrt
 ----

-..  autofunction:: paddle.v2.fluid.layers.sqrt
+..  autofunction:: paddle.fluid.layers.sqrt
    :noindex:

 abs
 ---

-..  autofunction:: paddle.v2.fluid.layers.abs
+..  autofunction:: paddle.fluid.layers.abs
    :noindex:

 ceil
 ----

-..  autofunction:: paddle.v2.fluid.layers.ceil
+..  autofunction:: paddle.fluid.layers.ceil
    :noindex:

 floor
 -----

-..  autofunction:: paddle.v2.fluid.layers.floor
+..  autofunction:: paddle.fluid.layers.floor
    :noindex:

 round
 -----

-..  autofunction:: paddle.v2.fluid.layers.round
+..  autofunction:: paddle.fluid.layers.round
    :noindex:

 reciprocal
 ----------

-..  autofunction:: paddle.v2.fluid.layers.reciprocal
+..  autofunction:: paddle.fluid.layers.reciprocal
    :noindex:

 log
 ---

-..  autofunction:: paddle.v2.fluid.layers.log
+..  autofunction:: paddle.fluid.layers.log
    :noindex:

 square
 ------

-..  autofunction:: paddle.v2.fluid.layers.square
+..  autofunction:: paddle.fluid.layers.square
    :noindex:

 softplus
 --------

-..  autofunction:: paddle.v2.fluid.layers.softplus
+..  autofunction:: paddle.fluid.layers.softplus
    :noindex:

 softsign
 --------

-..  autofunction:: paddle.v2.fluid.layers.softsign
+..  autofunction:: paddle.fluid.layers.softsign
    :noindex:

 brelu
 -----

-..  autofunction:: paddle.v2.fluid.layers.brelu
+..  autofunction:: paddle.fluid.layers.brelu
    :noindex:

 leaky_relu
 ----------

-..  autofunction:: paddle.v2.fluid.layers.leaky_relu
+..  autofunction:: paddle.fluid.layers.leaky_relu
    :noindex:

 soft_relu
 ---------

-..  autofunction:: paddle.v2.fluid.layers.soft_relu
+..  autofunction:: paddle.fluid.layers.soft_relu
    :noindex:

 elu
 ---

-..  autofunction:: paddle.v2.fluid.layers.elu
+..  autofunction:: paddle.fluid.layers.elu
    :noindex:

 relu6
 -----

-..  autofunction:: paddle.v2.fluid.layers.relu6
+..  autofunction:: paddle.fluid.layers.relu6
    :noindex:

 pow
 ---

-..  autofunction:: paddle.v2.fluid.layers.pow
+..  autofunction:: paddle.fluid.layers.pow
    :noindex:

 stanh
 -----

-..  autofunction:: paddle.v2.fluid.layers.stanh
+..  autofunction:: paddle.fluid.layers.stanh
    :noindex:

 hard_shrink
 -----------

-..  autofunction:: paddle.v2.fluid.layers.hard_shrink
+..  autofunction:: paddle.fluid.layers.hard_shrink
    :noindex:

 thresholded_relu
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
+..  autofunction:: paddle.fluid.layers.thresholded_relu
    :noindex:

 hard_sigmoid
 ------------

-..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
+..  autofunction:: paddle.fluid.layers.hard_sigmoid
    :noindex:

 swish
 -----

-..  autofunction:: paddle.v2.fluid.layers.swish
+..  autofunction:: paddle.fluid.layers.swish
    :noindex:

 tensor
@@ -740,66 +740,66 @@ tensor
 create_tensor
 -------------

-..  autofunction:: paddle.v2.fluid.layers.create_tensor
+..  autofunction:: paddle.fluid.layers.create_tensor
    :noindex:

 create_parameter
 ----------------

-..  autofunction:: paddle.v2.fluid.layers.create_parameter
+..  autofunction:: paddle.fluid.layers.create_parameter
    :noindex:

 create_global_var
 -----------------

-..  autofunction:: paddle.v2.fluid.layers.create_global_var
+..  autofunction:: paddle.fluid.layers.create_global_var
    :noindex:

 cast
 ----

-..  autofunction:: paddle.v2.fluid.layers.cast
+..  autofunction:: paddle.fluid.layers.cast
    :noindex:

 concat
 ------

-..  autofunction:: paddle.v2.fluid.layers.concat
+..  autofunction:: paddle.fluid.layers.concat
    :noindex:

 sums
 ----

-..  autofunction:: paddle.v2.fluid.layers.sums
+..  autofunction:: paddle.fluid.layers.sums
    :noindex:

 assign
 ------

-..  autofunction:: paddle.v2.fluid.layers.assign
+..  autofunction:: paddle.fluid.layers.assign
    :noindex:

 fill_constant_batch_size_like
 -----------------------------

-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
    :noindex:

 fill_constant
 -------------

-..  autofunction:: paddle.v2.fluid.layers.fill_constant
+..  autofunction:: paddle.fluid.layers.fill_constant
    :noindex:

 ones
 ----

-..  autofunction:: paddle.v2.fluid.layers.ones
+..  autofunction:: paddle.fluid.layers.ones
    :noindex:

 zeros
 -----

-..  autofunction:: paddle.v2.fluid.layers.zeros
+..  autofunction:: paddle.fluid.layers.zeros
    :noindex:

--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -8,24 +8,24 @@ nets
 simple_img_conv_pool
 --------------------

-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
    :noindex:

 sequence_conv_pool
 ------------------

-..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+..  autofunction:: paddle.fluid.nets.sequence_conv_pool
    :noindex:

 glu
 ---

-..  autofunction:: paddle.v2.fluid.nets.glu
+..  autofunction:: paddle.fluid.nets.glu
    :noindex:

 scaled_dot_product_attention
 ----------------------------

-..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
+..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
    :noindex:

--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -8,42 +8,42 @@ optimizer
 SGD
 ---

-..  autoclass:: paddle.v2.fluid.optimizer.SGD
+..  autoclass:: paddle.fluid.optimizer.SGD
    :members:
    :noindex:

 Momentum
 --------

-..  autoclass:: paddle.v2.fluid.optimizer.Momentum
+..  autoclass:: paddle.fluid.optimizer.Momentum
    :members:
    :noindex:

 Adagrad
 -------

-..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
+..  autoclass:: paddle.fluid.optimizer.Adagrad
    :members:
    :noindex:

 Adam
 ----

-..  autoclass:: paddle.v2.fluid.optimizer.Adam
+..  autoclass:: paddle.fluid.optimizer.Adam
    :members:
    :noindex:

 Adamax
 ------

-..  autoclass:: paddle.v2.fluid.optimizer.Adamax
+..  autoclass:: paddle.fluid.optimizer.Adamax
    :members:
    :noindex:

 DecayedAdagrad
 --------------

-..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
    :members:
    :noindex:

--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -8,14 +8,14 @@ param_attr
 ParamAttr
 ---------

-..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+..  autoclass:: paddle.fluid.param_attr.ParamAttr
    :members:
    :noindex:

 WeightNormParamAttr
 -------------------

-..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
    :members:
    :noindex:

--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
@@ -8,18 +8,18 @@ profiler
 cuda_profiler
 -------------

-..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+..  autofunction:: paddle.fluid.profiler.cuda_profiler
    :noindex:

 reset_profiler
 --------------

-..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+..  autofunction:: paddle.fluid.profiler.reset_profiler
    :noindex:

 profiler
 --------

-..  autofunction:: paddle.v2.fluid.profiler.profiler
+..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -8,20 +8,20 @@ regularizer
 append_regularization_ops
 -------------------------

-..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
+..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
    :noindex:

 L1Decay
 -------

-..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+..  autoclass:: paddle.fluid.regularizer.L1Decay
    :members:
    :noindex:

 L2Decay
 -------

-..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+..  autoclass:: paddle.fluid.regularizer.L2Decay
    :members:
    :noindex:

--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
-API
-===
-
-..  toctree::
-    :maxdepth: 1
-
-    模型配置 <v2/model_configs.rst>
-    数据访问 <v2/data.rst>
-    训练与应用 <v2/run_logic.rst>
-    v2/fluid.rst
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -4,7 +4,8 @@ API
 ..  toctree::
    :maxdepth: 1

+    overview.rst
    v2/model_configs.rst
    v2/data.rst
    v2/run_logic.rst
-    v2/fluid.rst
+    fluid/index.rst
--- a/doc/api/overview.rst
+++ b/doc/api/overview.rst
+V2 API Overview
+================
+
+The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1 (the original layer-based platform of PaddlePaddle).
+It introduces some high-level concepts such as `Layers <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/layer.html>`_ , `Optimizer <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/optimizer.html>`_ , `Evaluator <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/evaluators.html>`_  and `Data Reader <http://www.paddlepaddle.org/docs/develop/api/en/v2/data/data_reader.html>`_ to make the model configuration more familiar to users.
+
+A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.
+
+We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phases.
+It has several easy-to-use methods:
+
+- `paddle.train` 
+- `paddle.test`
+- `paddle.infer`
+
+To better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users through callbacks.
--- a/doc/api/v2/fluid.rst
+++ b/doc/api/v2/fluid.rst
-======================
-Fluid
-======================
-
-..  toctree::
-    :maxdepth: 1
-
-    fluid/layers.rst
-    fluid/data_feeder.rst
-    fluid/executor.rst
-    fluid/initializer.rst
-    fluid/evaluator.rst
-    fluid/nets.rst
-    fluid/optimizer.rst
-    fluid/param_attr.rst
-    fluid/profiler.rst
-    fluid/regularizer.rst
-    fluid/io.rst
--- a/doc/build_and_install/build_cn.md
+++ b/doc/build_and_install/build_cn.md
-# 用Docker编译和测试PaddlePaddle
-
-## 需要的软硬件
-
-为了开发PaddlePaddle，我们需要
-
-1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
-1. Docker。
-
-不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
-
-## 总体流程
-
-1. 获取源码
-
-   ```bash
-   git clone https://github.com/paddlepaddle/paddle
-   ```
-
-2. 安装开发工具到 Docker image 里
-
-   ```bash
-   cd paddle; docker build -t paddle:dev .
-   ```
-
-   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
-
-3. 编译
-
-   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
-
-   ```bash
-   docker run --rm -v $PWD:/paddle paddle:dev
-   ```
-
-   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
-
-   ```bash
-   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
-   ```
-
-4. 运行单元测试
-
-   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
-
-   ```bash
-   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
-
-   ```bash
-   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
-
-   ```bash
-   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
-   ```
-
-5. 清理
-
-   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
-
-   ```bash
-   rm -rf build
-   ```
-
-## 为什么要 Docker 呀？
-
- 什么是 Docker?
-
-  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
-
- Docker 还是虚拟机？
-
-  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
-
- 为什么用 Docker?
-
-  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
-
-  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
-
- 我可以选择不用Docker吗？
-
-  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
-
- 学习 Docker 有多难？
-
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
-
- 我可以用 IDE 吗？
-
-  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
-
-  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
-
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
-
-  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
-
- 可以并行编译吗？
-
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
-
-## 可能碰到的问题
-
- Docker 需要 sudo
-
-  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
-
- 在 Windows/MacOS 上编译很慢
-
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
-
- 磁盘不够
-
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
--- a/doc/build_and_install/build_en.md
+++ b/doc/build_and_install/build_en.md
-# Build using Docker
-
-## What Developers Need
-
-To contribute to PaddlePaddle, you need
-
-1. A computer -- Linux, BSD, Windows, MacOS, and
-1. Docker.
-
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
-
-## General Process
-
-1. Retrieve source code.
-
-   ```bash
-   git clone https://github.com/paddlepaddle/paddle
-   ```
-
-2. Install build tools into a Docker image.
-
-   ```bash
-   cd paddle; docker build -t paddle:dev .
-   ```
-
-   Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
-
-3. Build from source.
-
-   This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
-
-   ```bash
-   docker run -v $PWD:/paddle paddle:dev
-   ```
-
-   Above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
-
-   ```bash
-   docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
-   ```
-
-4. Run unit tests.
-
-   To run all unit tests using the first GPU of a node:
-
-   ```bash
-   NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
-
-   ```bash
-   docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   Sometimes we want to run a specific unit test, say `memory_test`, we can run
-
-   ```bash
-   nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
-   ```
-
-5. Clean Build.
-
-   Sometimes, we might want to clean all thirt-party dependents and built binaries.  To do so, just
-
-   ```bash
-   rm -rf build
-   ```
-
-## Docker, Or Not?
-
- What is Docker?
-
-  If you haven't heard of it, consider it something like Python's virtualenv.
-
- Docker or virtual machine?
-
-  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
-
- Why Docker?
-
-  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
-
-  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
-
- Can I choose not to use Docker?
-
-  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
-
- How difficult is it to learn Docker?
-
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
-
- Can I use my favorite IDE?
-
-  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
-
-  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
-
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
-
-  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
-
- Does Docker do parallel building?
-
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
-
-## Some Gotchas
-
- Docker requires sudo
-
-  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
-
- Docker on Windows/MacOS builds slowly
-
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
-
- Not enough disk space
-
-  Examples in this article uses option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
 从源码编译
 ======================

+.. _requirements:
+
+需要的软硬件
+----------------
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
 .. _build_step:

 编译方法
 ----------------

-PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者
+参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。

 如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。

@@ -16,15 +28,19 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译

 .. code-block:: bash

+   # 1. 获取源码
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
+   docker build -t paddle:dev .
+   # 3. 执行下面的命令编译CPU-Only的二进制
   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-   # 如果不使用Docker编译环境，执行下面的命令
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
+   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+
+注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
+构建的镜像（上述第4步）会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
+最后的执行脚本的命令。

 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：

@@ -50,28 +66,83 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译

 如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：

-使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
 开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。

 .. code-block:: bash

   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh

-如果不使用Docker，可以执行ctest命令即可：
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：

 .. code-block:: bash

-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
-   ctest
-   # 指定执行其中一个单元测试 test_mul_op
-   ctest -R test_mul_op
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   bash /paddle/paddle/scripts/docker/build.sh
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+----------------
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command
+     "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Bash 脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
+
+- 磁盘不够
+
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+

 .. _compile_deps:

-编译依赖
+附录：编译依赖
 ----------------

 PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
@@ -91,7 +162,7 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其

 .. _build_options:

-编译选项
+附录：编译选项
 ----------------

 PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
@@ -118,7 +189,7 @@ PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种B
    "WITH_TESTING", "是否开启单元测试", "OFF"
    "WITH_DOC", "是否编译中英文文档", "OFF"
    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
-    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"

 BLAS

--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
 Build from Sources
 ==========================

-.. _build_step:
+.. _requirements:

-How To Build
+Requirements
 ----------------

-PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
-tools. We recommend you to use our pre-built Docker image to run the build
-to avoid installing dependencies by yourself. We have several build environment
-Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+To build PaddlePaddle, you need
+
+1. A computer -- Linux, Windows, MacOS.
+2. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+We run all the tools by running this image.
+
+.. _build_step:

-If you choose not to use Docker image for your build, you need to install the
-below `Compile Dependencies`_ before run the build.
+How To Build
+----------------

-Then run:
+You need to use Docker to build PaddlePaddle
+to avoid installing dependencies by yourself. We have several pre-built
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+or you can build your own image from source, as shown in the optional step below:

 .. code-block:: bash

+   # 1. clone the source code
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   # run the following command to build a CPU-Only binaries if you are using docker
+   # 2. Optional: build development docker image from source
+   docker build -t paddle:dev .
+   # 3. Run the following command to build CPU-only binaries
   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-   # else run these commands
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
+   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+
+NOTE: The above commands mount the current working directory (the root directory of the source code)
+into the :code:`/paddle` directory inside the Docker container. If you are using your own image
+(step 4), it will run the default entry-point :code:`build.sh` , so you can omit the trailing
+script command used in step 3.

 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
@@ -61,22 +74,75 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.

   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh

-If you don't use Docker, just run ctest will start the tests:
+If you wish to run only one unit test, like :code:`test_sum_op`:

 .. code-block:: bash

-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
-   make
-   ctest
-   # run a single test like test_mul_op
-   ctest -R test_mul_op
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   bash /paddle/paddle/scripts/docker/build.sh
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+Frequently Asked Questions
+--------------------------
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker neither virtualizes any hardware nor runs a guest OS, which means there is no compromise on the performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.

+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require some new tools.  Not to mention the time saved when other people try to reproduce the issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their :code:`~/.emacs` configuration file:
+
+  .. code-block:: emacs
+
+     (global-set-key "\C-cc" 'compile)
+     (setq compile-command
+      "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our building Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_, which calls :code:`make -j$(nproc)` to start as many processes as the number of your CPU cores.
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so as to make the build efficient.  Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
+
+- Not enough disk space
+
+  Examples in this article use option :code:`--rm` with the :code:`docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use :code:`docker ps -a` to list all containers, including stopped ones.  Sometimes :code:`docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_.

 .. _compile_deps:

-Compile Dependencies
+Appendix: Compile Dependencies
 ----------------

 PaddlePaddle need the following dependencies when compiling, other dependencies
@@ -97,17 +163,13 @@ will be downloaded automatically.

 .. _build_options:

-Build Options
+Appendix: Build Options
 ----------------

 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
 For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。

-.. _build_options_bool:
-
-Bool Type Options
----------------

 You can add :code:`-D` argument to pass such options, like:

@@ -129,7 +191,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_TESTING", "Build unit tests", "OFF"
    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
-    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"



--- a/doc/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,6 @@ PaddlePaddle提供pip和Docker的安装方式：

   pip_install_cn.rst
   docker_install_cn.rst
-   build_cn.md

 编译流程
 ++++++++

--- a/doc/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,8 +13,6 @@ You can choose either pip or Docker to complete your install:

   pip_install_en.rst
   docker_install_en.rst
-   build_en.md
-

 Build from Source
 -----------------

--- a/doc/design/concurrent_programming.md
+++ b/doc/design/concurrent_programming.md
@@ -12,7 +12,7 @@ The following table compares concepts in Fluid and Go

 | Go | Fluid |
 |----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
+|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
 | control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
 | goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
 | runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |

--- a/doc/design/fluid.md
+++ b/doc/design/fluid.md
@@ -89,7 +89,7 @@ with train_loop.block():
    h[t] = the_step(input[t])
 ```    

-An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).

 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.


--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
--- a/doc/design/memory_optimization.md
+++ b/doc/design/memory_optimization.md
@@ -101,7 +101,7 @@ In-place is a built-in attribute of an operator. Since we treat in-place and oth

 #### contruct control flow graph

-Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
+Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.

 - Block0:


--- a/doc/design/parallel_do.md
+++ b/doc/design/parallel_do.md
+# Design Doc: Parallel_Do in PaddlePaddle
+
+In PaddlePaddle, we use parallel_do primitive to represent multithread data parallel processing.
+
+## Design overview
+
+The definition of a parallel_do op looks like the following
+
+```c++
+AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
+AddInput(kParameters, "Parameters are duplicated over different devices")
+    .AsDuplicable();
+AddInput(kPlaces, "Devices used for parallel processing");
+AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
+AddOutput(kParallelScopes,
+          "Scopes for all local variables in forward pass. One scope for each device");
+AddAttr<framework::BlockDesc *>(kParallelBlock,
+                                "List of operaters to be executed in parallel");
+```
+
+A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and
+`||||` means multiple threads)
+
+```
+In the forward pass
+  |      Split input onto different devices
+  |      Copy parameter onto different devices
+  ||||   Compute forward pass in parallel
+  |      Merge output from different devices
+
+In the backward pass
+  |      Split output@grad onto different devices
+  ||||   Compute backward pass in parallel
+  |      accumulate param@grad from different devices to the first device
+  |      Merge input@grad from different devices
+  |      Copy param@grad to the place of parallel_do_op
+```
+
+This implementation allows writing mixed-device programs like this:
+
+```python
+# get embedding feature on CPU
+feature = some_cpu_only_op(data)
+
+gpu_places = get_place(use_gpu=True)
+# parallel processing on multiple GPUs
+pd = ParallelDo(gpu_places)
+with pd.do():
+    read_input(feature)
+    prediction = my_net(feature)
+    write_output(prediction)
+prediction = pd()
+loss = cross_entropy(prediction, label)
+```
+
+And the ProgramDesc looks like the following:
+
+```
+# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU
+start_program
+{
+  vars: w1, w2
+  ops: init(w1), init(w2)
+}
+
+main_program
+{
+block0 {
+  vars: data, places, w1, w2
+  ops: data, get_place, parallel_do(block1),
+       parallel_do_grad(block2),
+       sgd(w2, w2_grad),
+       sgd(w1, w1_grad)
+}
+block1 {
+  parent_block: 0
+  vars: data, h1, h2, loss
+  ops: fc, fc, softmax
+}
+block2 {
+  parent_block: 1
+  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+  ops: softmax_grad,
+       fc_grad
+       fc_grad
+}
+}
+```
+
+## Performance Improvement
+
+There are several places where we can make this parallel_do faster.
+
+### forward: split input onto different devices
+
+If the input of the parallel_do is independent from any prior operators, we can avoid this step by
+prefetching the input onto different devices in a separate background thread. And the python code
+looks like this.
+```python
+pd = ParallelDo(gpu_places)
+with pd.do():
+    feature = get_data_from_prefetch_queue(gpu_places)
+    prediction = my_net(feature)
+    write_output(prediction)
+```
+
+### forward: Copy parameter onto different devices
+
+We can avoid this step by making each device have a copy of the parameter. This requires:
+
+1. `fluid.default_startup_program()` to be run on all devices
+1. In the backward, allreduce param@grad at different devices, this requires
+    1. `backward.py` add `allreduce` operators at parallel_do_grad
+    1. `allreduce` operators need to be called in async mode to achieve maximum throughput
+1. apply gradient-related ops (e.g. clipping, normalization, decay, sgd) on different devices in parallel
+
+By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device".
+And the ProgramDesc looks like the following
+
+```
+# w1, w2 will be allocated on all GPUs
+start_program
+{
+block0 {
+  parallel_do(block1)
+}
+block1 {
+  parent_block: 0
+  vars: w1, w2
+  ops: init(w1), init(w2)
+}
+}
+
+main_program
+{
+block0 {
+  vars: data, places, w1, w2
+  ops: data, get_place, parallel_do(block1),
+       parallel_do_grad(block2),      # append_backward
+       parallel_do(block3)            # append_optimization
+       
+}
+block1 {
+  parent_block: 0
+  vars: data, h1, h2, loss
+  ops: fc, fc, softmax
+}
+block2 {
+  parent_block: 1
+  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+  ops: softmax_grad,
+       fc_grad, allreduce(places, scopes, w1_grad),
+       fc_grad, allreduce(places, scopes, w2_grad)
+}
+block3 {
+  parent_block: 0
+  vars: lr
+  ops: sgd(w2, w2_grad),
+       sgd(w1, w1_grad)
+}
+}
+```
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
 ## Background
 PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.

-PaddlePaddle use proto message to describe compile time program because
+PaddlePaddle uses proto message to describe compile time program because:

 1. The computation program description must be serializable and saved in a file.
-1. During distributed training, the sreialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.

 The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.

@@ -14,28 +14,33 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
 |Operation|OpDesc(proto)|Operator(cpp)|


-## Definition of VarDesc
+## Definition of VarType

-A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. 
+A VarDesc should have a name, a type, and a flag indicating whether or not it is persistable. Apart from the POD types, PaddlePaddle supports several other kinds of variable types, such as `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as follows:

 ```proto
 message VarDesc {
  required string name = 1;
-  enum VarType {
-    LOD_TENSOR = 0;
-    SELECTED_ROWS = 1;
-  }
  required VarType type = 2;
-  optional LoDTensorDesc lod_desc = 3;
-  optional TensorDesc selected_rows_desc = 4;
-  optional bool persistable = 5 [ default = false ];
+  optional bool persistable = 3 [ default = false ];
 }
 ```

 ## Definition of TensorDesc

 ```proto
-enum DataType {
+message TensorDesc {
+  // Should only be PODType. Is enforced in C++
+  required Type data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+The `Type` here comes from the enum defined inside of `VarType` :
+
+```proto
+enum Type {
+  // Pod Types
  BOOL = 0;
  INT16 = 1;
  INT32 = 2;
@@ -43,11 +48,18 @@ enum DataType {
  FP16 = 4;
  FP32 = 5;
  FP64 = 6;
-}

-message TensorDesc {
-  required DataType data_type = 1;
-  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  // Other types that may need additional descriptions
+  LOD_TENSOR = 7;
+  SELECTED_ROWS = 8;
+  FEED_MINIBATCH = 9;
+  FETCH_LIST = 10;
+  STEP_SCOPES = 11;
+  LOD_RANK_TABLE = 12;
+  LOD_TENSOR_ARRAY = 13;
+  PLACE_LIST = 14;
+  READER = 15;
+  CHANNEL = 16;
 }
 ```

@@ -58,7 +70,7 @@ A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedR
 ```proto
 message LoDTensorDesc {
  required TensorDesc tensor = 1;
-  optional int lod_level = 2;
+  optional int32 lod_level = 2 [ default = 0 ];
 }
 ```


--- a/doc/faq/local/src/reduce_min_pool_size.py
+++ b/doc/faq/local/src/reduce_min_pool_size.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/faq/local/src/word2vec_config.py
+++ b/doc/faq/local/src/word2vec_config.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/faq/local/src/word2vec_dataprovider.py
+++ b/doc/faq/local/src/word2vec_dataprovider.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/howto/capi/index_cn.rst
+++ b/doc/howto/capi/index_cn.rst
 C-API预测库
 ==================

+当我们训练完一个神经网络模型之后，下一步就是用模型来做预测。预测就是准备输入数据，经过模型处理之后，得到预测结果的过程。
+
+相比于模型训练，预测有如下特点：
+
+#. 预测不需要训练过程中反向传播和参数更新的部分。
+#. 预测不需要标签(label)。
+#. 预测很多时候需要和用户系统整合在一起。
+
+因为上述特点，模型预测SDK需要单独设计，并具备以下特点：
+
+#. 预测SDK不包含反向传播和参数更新部分，以减小SDK的体积。
+#. 预测SDK需要提供一个简洁的用户接口，方便使用。
+#. 因为输入数据可能有多种结构，对输入数据的格式做清晰简洁的封装。
+#. 为了和用户系统兼容，SDK的接口需要是满足C标准的接口。
+
+PaddlePaddle提供了C-API，用于解决上述问题。关于C-API的使用，我们提供了如下指南：
+
 ..  toctree::
  :maxdepth: 1


--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -65,6 +65,7 @@
    output_file = "output.paddle.model"
    merge_v2_model(net, param_file, output_file)
    ```
+
    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例，可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式，运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。

 #### 注意事项

--- a/doc/howto/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/cluster/fluid_cluster_train_en.md
@@ -32,7 +32,7 @@ The non-cluster version of this demo with fluid API is as follows:

 ``` python
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
+import paddle.fluid as fluid

 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -125,11 +125,11 @@ for pass_id in range(100):

 ### E2E demo

-Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
 First `cd` into the folder that contains the `python` files. In this case:

 ```bash
-cd /paddle/python/paddle/v2/fluid/tests/book_distribute
+cd /paddle/python/paddle/fluid/tests/book_distribute
 ```

 In parameter server node run the following in the command line:

--- a/doc/howto/cluster/multi_cluster/index_cn.rst
+++ b/doc/howto/cluster/multi_cluster/index_cn.rst
 在不同集群中运行
 ================
+用户的集群环境不尽相同，为了方便大家的部署，我们提供了多种的集群部署方式，方便提交集群训练任务，以下将一一介绍:

-PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
+`Kubernetes <http://kubernetes.io>`_ 是Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持：

-对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
+..  toctree::
+  :maxdepth: 1
+
+  k8s_cn.md
+  k8s_distributed_cn.md

-在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
+`OpenMPI <https://www.open-mpi.org>`_  是成熟的高性能并行计算框架，在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务:

 ..  toctree::
  :maxdepth: 1

-  fabric_cn.md
  openmpi_cn.md
-  k8s_cn.md
-  k8s_distributed_cn.md
+
+`Fabric <http://www.fabfile.org>`_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法，如果想详细了解，请阅读以下指南:
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+
+我们也支持在AWS上部署PaddlePaddle，详细请了解:
+
+..  toctree::
+  :maxdepth: 1
+
  k8s_aws_cn.md
+
+您可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到以上相关的例子。
+
--- a/doc/howto/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/cluster/src/word2vec/api_train_v2.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/cluster/src/word2vec/api_train_v2_cluster.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/howto/cluster/src/word2vec/prepare.py
+++ b/doc/howto/cluster/src/word2vec/prepare.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/doc/howto/optimization/cpu_profiling_cn.md
+++ b/doc/howto/optimization/cpu_profiling_cn.md
@@ -35,7 +35,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 ```
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
-     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
     4696   12.040    0.003   12.040    0.003 {built-in method run}
        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
 ```
@@ -61,9 +61,9 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 ```text
     4696   12.040    0.003   12.040    0.003 {built-in method run}
   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
-   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
-     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
-        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
 ```

 可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
@@ -76,9 +76,9 @@ Called By:

 Function                                                                                                 was called by...
                                                                                                             ncalls  tottime  cumtime
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
-                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)


 Called:

--- a/doc/howto/optimization/cpu_profiling_en.md
+++ b/doc/howto/optimization/cpu_profiling_en.md
@@ -49,7 +49,7 @@ port, we will see the output like the following:
 ```
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
-     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
     4696   12.040    0.003   12.040    0.003 {built-in method run}
        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
 ```
@@ -74,9 +74,9 @@ focus on. We can sort above profiling file by tottime:
 ```text
     4696   12.040    0.003   12.040    0.003 {built-in method run}
   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
-   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
-     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
-        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
 ```

 We can see that the most time-consuming function is the `built-in
@@ -93,9 +93,9 @@ Called By:

 Function                                                                                                 was called by...
                                                                                                             ncalls  tottime  cumtime
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
-/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
-                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)


 Called:

--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
 # PaddlePaddle Fluid Source Code Overview

-Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book

 Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework

@@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
 sgd_optimizer.minimize(avg_cost)
 ```

- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
  - Every Layer has one or more operators and variables/parameters
    - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
 - Optimizer: `fluid.optimizer.SGD`. It does the following
-  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
-  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]

 # Run Time

@@ -57,7 +57,7 @@ exe.run(fluid.default_main_program(),

 - Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
  - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
  - Feeds the data: `feed=feeder.feed(data)`
  - Evaluates all the operators
  - Fetches the result: `fetch_list=[avg_cost]`

--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -23,6 +23,12 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
 $ docker pull paddlepaddle/paddle:latest-dev-android
 ```

+对于国内用户，我们提供了加速访问的镜像源：
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
 ### 编译PaddlePaddle C-API库
 构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
 Android的Docker开发镜像向用户提供两个可配置的参数：
@@ -56,15 +62,15 @@ Android的Docker开发镜像向用户提供两个可配置的参数：

 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库

-  ```bash
-  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
-  ```
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+```

 - 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库

-  ```bash
-  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
-  ```
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+```

 执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。

@@ -155,7 +161,11 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      ..
 ```

-用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
+用户还可根据自己的需求设置其他编译参数。
+
+- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`，最小化生成的库的大小。
+- 设置`CMAKE_BUILD_TYPE`为`Release`，获得最快的执行速度。
+- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。

 **性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：


--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -25,6 +25,12 @@ Users can directly use the published Docker image.
 $ docker pull paddlepaddle/paddle:latest-dev-android
 ```

+For users in China, we provide a faster mirror:
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
 ### Build the Inference Library

 We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
@@ -86,19 +92,19 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht

 - To build the standalone toolchain for `armeabi-v7a` and Android API level 21:

-  ```bash
-  your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-          --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
-  ```
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+```
  
  The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.

 - To build the standalone toolchain for `arm64-v8a` and Android API level 21:

-  ```bash
-  your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-          --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
-  ```
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```

  The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.


--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -82,7 +82,7 @@ language = 'zh_CN'

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_en*', '*_en*']
+exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']

 # The reST default role (used for this markup: `text`) to use for all
 # documents.

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -82,7 +82,7 @@ language = None

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']

 # The reST default role (used for this markup: `text`) to use for all
 # documents.

--- a/go/CMakeLists.txt
+++ b/go/CMakeLists.txt
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
--- a/go/cmd/pserver/CMakeLists.txt
+++ b/go/cmd/pserver/CMakeLists.txt
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
--- a/go/connection/conn.go
+++ b/go/connection/conn.go
--- a/go/master/CMakeLists.txt
+++ b/go/master/CMakeLists.txt
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
--- a/go/master/client.go
+++ b/go/master/client.go
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
--- a/go/master/service.go
+++ b/go/master/service.go
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
--- a/go/pserver/CMakeLists.txt
+++ b/go/pserver/CMakeLists.txt
--- a/go/pserver/client/CMakeLists.txt
+++ b/go/pserver/client/CMakeLists.txt
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
--- a/go/pserver/client/c/test/test_cclient.c
+++ b/go/pserver/client/c/test/test_cclient.c
--- a/go/pserver/client/c/test/test_mnist.py
+++ b/go/pserver/client/c/test/test_mnist.py
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
--- a/go/utils/networkhelper/CMakeLists.txt
+++ b/go/utils/networkhelper/CMakeLists.txt
--- a/go/utils/networkhelper/helper.go
+++ b/go/utils/networkhelper/helper.go
--- a/go/utils/networkhelper/helper_test.go
+++ b/go/utils/networkhelper/helper_test.go
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
--- a/paddle/api/PaddleAPIPrivate.h
+++ b/paddle/api/PaddleAPIPrivate.h
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
--- a/paddle/api/test/testTrainConfig.py
+++ b/paddle/api/test/testTrainConfig.py
--- a/paddle/capi/Arguments.cpp
+++ b/paddle/capi/Arguments.cpp
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
--- a/paddle/capi/Vector.cpp
+++ b/paddle/capi/Vector.cpp
--- a/paddle/capi/arguments.h
+++ b/paddle/capi/arguments.h
--- a/paddle/capi/capi.h
+++ b/paddle/capi/capi.h
--- a/paddle/capi/capi_private.h
+++ b/paddle/capi/capi_private.h
--- a/paddle/capi/error.cpp
+++ b/paddle/capi/error.cpp
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
--- a/paddle/capi/examples/model_inference/common/common.h
+++ b/paddle/capi/examples/model_inference/common/common.h
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
--- a/paddle/capi/examples/model_inference/dense/merge_v2_model.py
+++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
--- a/paddle/capi/examples/model_inference/dense/mnist_v2.py
+++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py
--- a/paddle/capi/examples/model_inference/dense/trainer_config.py
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
--- a/paddle/capi/examples/model_inference/multi_thread/main.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
--- a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
--- a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
+++ b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
--- a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
+++ b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
--- a/paddle/capi/examples/model_inference/sequence/main.c
+++ b/paddle/capi/examples/model_inference/sequence/main.c
--- a/paddle/capi/examples/model_inference/sequence/trainer_config.py
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
--- a/paddle/capi/examples/model_inference/sparse_binary/main.c
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
--- a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
+++ b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
--- a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
+++ b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
--- a/paddle/capi/tests/test_Arguments.cpp
+++ b/paddle/capi/tests/test_Arguments.cpp
--- a/paddle/capi/tests/test_GradientMachine.cpp
+++ b/paddle/capi/tests/test_GradientMachine.cpp
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
--- a/paddle/capi/tests/test_Vector.cpp
+++ b/paddle/capi/tests/test_Vector.cpp
--- a/paddle/capi/tests/test_predict_network.py
+++ b/paddle/capi/tests/test_predict_network.py
--- a/paddle/capi/vector.h
+++ b/paddle/capi/vector.h
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
--- a/paddle/cuda/include/hl_batch_norm.h
+++ b/paddle/cuda/include/hl_batch_norm.h
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
--- a/paddle/cuda/include/hl_cpu_lstm.cuh
+++ b/paddle/cuda/include/hl_cpu_lstm.cuh
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
--- a/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
--- a/paddle/cuda/include/hl_cpu_scalar.cuh
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
--- a/paddle/cuda/include/hl_cpu_simd_neon.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
--- a/paddle/cuda/include/hl_cpu_simd_sse.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
--- a/paddle/cuda/include/hl_cuda.ph
+++ b/paddle/cuda/include/hl_cuda.ph
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
--- a/paddle/cuda/include/hl_cuda_cudnn.ph
+++ b/paddle/cuda/include/hl_cuda_cudnn.ph
--- a/paddle/cuda/include/hl_device_functions.cuh
+++ b/paddle/cuda/include/hl_device_functions.cuh
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
--- a/paddle/cuda/include/hl_gpu_functions.cuh
+++ b/paddle/cuda/include/hl_gpu_functions.cuh
--- a/paddle/cuda/include/hl_gpu_gru.cuh
+++ b/paddle/cuda/include/hl_gpu_gru.cuh
--- a/paddle/cuda/include/hl_gpu_lstm.cuh
+++ b/paddle/cuda/include/hl_gpu_lstm.cuh
--- a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
--- a/paddle/cuda/include/hl_gru_ops.cuh
+++ b/paddle/cuda/include/hl_gru_ops.cuh
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
--- a/paddle/cuda/include/hl_lstm_ops.cuh
+++ b/paddle/cuda/include/hl_lstm_ops.cuh
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
--- a/paddle/cuda/include/hl_matrix_apply.cuh
+++ b/paddle/cuda/include/hl_matrix_apply.cuh
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
--- a/paddle/cuda/include/hl_matrix_base_detail.cuh
+++ b/paddle/cuda/include/hl_matrix_base_detail.cuh
--- a/paddle/cuda/include/hl_matrix_ops.cuh
+++ b/paddle/cuda/include/hl_matrix_ops.cuh
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
--- a/paddle/cuda/include/hl_perturbation_util.cuh
+++ b/paddle/cuda/include/hl_perturbation_util.cuh
--- a/paddle/cuda/include/hl_recurrent_apply.cuh
+++ b/paddle/cuda/include/hl_recurrent_apply.cuh
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
--- a/paddle/cuda/include/hl_sparse.ph
+++ b/paddle/cuda/include/hl_sparse.ph
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
--- a/paddle/cuda/include/hl_thread.ph
+++ b/paddle/cuda/include/hl_thread.ph
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
--- a/paddle/cuda/src/hl_batch_norm.cu
+++ b/paddle/cuda/src/hl_batch_norm.cu
--- a/paddle/cuda/src/hl_batch_transpose.cu
+++ b/paddle/cuda/src/hl_batch_transpose.cu
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
--- a/paddle/cuda/src/hl_perturbation_util.cu
+++ b/paddle/cuda/src/hl_perturbation_util.cu
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
--- a/paddle/fluid/framework/backward.cc
+++ b/paddle/fluid/framework/backward.cc
--- a/paddle/fluid/framework/backward.h
+++ b/paddle/fluid/framework/backward.h
--- a/paddle/fluid/framework/backward_test.cc
+++ b/paddle/fluid/framework/backward_test.cc
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
--- a/paddle/fluid/framework/ddim_test.cc
+++ b/paddle/fluid/framework/ddim_test.cc
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
--- a/paddle/fluid/framework/dim_test.cu
+++ b/paddle/fluid/framework/dim_test.cu
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
--- a/paddle/fluid/framework/eigen_test.cc
+++ b/paddle/fluid/framework/eigen_test.cc
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
--- a/paddle/fluid/framework/init.h
+++ b/paddle/fluid/framework/init.h
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
--- a/paddle/fluid/framework/lod_rank_table.cc
+++ b/paddle/fluid/framework/lod_rank_table.cc
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
--- a/paddle/fluid/framework/proto_desc.h
+++ b/paddle/fluid/framework/proto_desc.h
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
--- a/paddle/fluid/framework/scope_test.cc
+++ b/paddle/fluid/framework/scope_test.cc
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.cu
+++ b/paddle/fluid/framework/tensor_util.cu
--- a/paddle/fluid/framework/tensor_util.cu
+++ b/paddle/fluid/framework/tensor_util.cu
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
--- a/paddle/fluid/memory/detail/memory_block.h
+++ b/paddle/fluid/memory/detail/memory_block.h
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
--- a/paddle/fluid/memory/detail/meta_cache.h
+++ b/paddle/fluid/memory/detail/meta_cache.h
--- a/paddle/fluid/memory/detail/meta_data.cc
+++ b/paddle/fluid/memory/detail/meta_data.cc
--- a/paddle/fluid/memory/detail/meta_data.h
+++ b/paddle/fluid/memory/detail/meta_data.h
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
--- a/paddle/fluid/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
--- a/paddle/fluid/operators/accuracy_op.h
+++ b/paddle/fluid/operators/accuracy_op.h
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
--- a/paddle/fluid/operators/adadelta_op.cu
+++ b/paddle/fluid/operators/adadelta_op.cu
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/adam_op.cu
+++ b/paddle/fluid/operators/adam_op.cu
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
--- a/paddle/fluid/operators/adamax_op.cu
+++ b/paddle/fluid/operators/adamax_op.cu
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
--- a/paddle/fluid/operators/clip_by_norm_op.cu
+++ b/paddle/fluid/operators/clip_by_norm_op.cu
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
--- a/paddle/fluid/operators/clip_op.cu
+++ b/paddle/fluid/operators/clip_op.cu
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
--- a/paddle/fluid/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
--- a/paddle/fluid/operators/compare_op.h
+++ b/paddle/fluid/operators/compare_op.h
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_op.cu.cc
+++ b/paddle/fluid/operators/conv_op.cu.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
--- a/paddle/fluid/operators/conv_shift_op.cu
+++ b/paddle/fluid/operators/conv_shift_op.cu
--- a/paddle/fluid/operators/conv_shift_op.h
+++ b/paddle/fluid/operators/conv_shift_op.h
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cu.cc
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
--- a/paddle/fluid/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
--- a/paddle/fluid/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
--- a/paddle/fluid/operators/create_reader_op.cc
+++ b/paddle/fluid/operators/create_reader_op.cc
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
--- a/paddle/fluid/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/decayed_adagrad_op.cu
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/safe_ref.h
+++ b/paddle/fluid/operators/detail/safe_ref.h
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
--- a/paddle/fluid/operators/detail/simple_block_queue.h
+++ b/paddle/fluid/operators/detail/simple_block_queue.h
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
--- a/paddle/fluid/operators/detection_output_op.cc
+++ b/paddle/fluid/operators/detection_output_op.cc
--- a/paddle/fluid/operators/detection_output_op.cu.cc
+++ b/paddle/fluid/operators/detection_output_op.cu.cc
--- a/paddle/fluid/operators/detection_output_op.h
+++ b/paddle/fluid/operators/detection_output_op.h
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
--- a/paddle/fluid/operators/edit_distance_op.h
+++ b/paddle/fluid/operators/edit_distance_op.h
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
--- a/paddle/fluid/operators/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise_max_op.cu
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
--- a/paddle/fluid/operators/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise_min_op.cu
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
--- a/paddle/fluid/operators/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise_pow_op.cu
--- a/paddle/fluid/operators/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise_pow_op.h
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
--- a/paddle/fluid/operators/fill_zeros_like_op.h
+++ b/paddle/fluid/operators/fill_zeros_like_op.h
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.cu
+++ b/paddle/fluid/operators/ftrl_op.cu
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
--- a/paddle/fluid/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
--- a/paddle/fluid/operators/hinge_loss_op.h
+++ b/paddle/fluid/operators/hinge_loss_op.h
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
--- a/paddle/fluid/operators/huber_loss_op.h
+++ b/paddle/fluid/operators/huber_loss_op.h
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
--- a/paddle/fluid/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/iou_similarity_op.cc
--- a/paddle/fluid/operators/iou_similarity_op.cu
+++ b/paddle/fluid/operators/iou_similarity_op.cu
--- a/paddle/fluid/operators/iou_similarity_op.h
+++ b/paddle/fluid/operators/iou_similarity_op.h
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
--- a/paddle/fluid/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
--- a/paddle/fluid/operators/l1_norm_op.h
+++ b/paddle/fluid/operators/l1_norm_op.h
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
--- a/paddle/fluid/operators/label_smooth_op.cu
+++ b/paddle/fluid/operators/label_smooth_op.cu
--- a/paddle/fluid/operators/label_smooth_op.h
+++ b/paddle/fluid/operators/label_smooth_op.h
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
--- a/paddle/fluid/operators/linear_chain_crf_op.cu
+++ b/paddle/fluid/operators/linear_chain_crf_op.cu
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
--- a/paddle/fluid/operators/log_loss_op.h
+++ b/paddle/fluid/operators/log_loss_op.h
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
--- a/paddle/fluid/operators/logical_op.cu
+++ b/paddle/fluid/operators/logical_op.cu
--- a/paddle/fluid/operators/logical_op.h
+++ b/paddle/fluid/operators/logical_op.h
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/lrn_op.cu
+++ b/paddle/fluid/operators/lrn_op.cu
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
--- a/paddle/fluid/operators/lstm_op.cu.cc
+++ b/paddle/fluid/operators/lstm_op.cu.cc
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
--- a/paddle/fluid/operators/lstm_unit_op.h
+++ b/paddle/fluid/operators/lstm_unit_op.h
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
--- a/paddle/fluid/operators/lstmp_op.cu
+++ b/paddle/fluid/operators/lstmp_op.cu
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
--- a/paddle/fluid/operators/margin_rank_loss_op.cu
+++ b/paddle/fluid/operators/margin_rank_loss_op.cu
--- a/paddle/fluid/operators/margin_rank_loss_op.h
+++ b/paddle/fluid/operators/margin_rank_loss_op.h
--- a/paddle/fluid/operators/math/context_project.cc
+++ b/paddle/fluid/operators/math/context_project.cc
--- a/paddle/fluid/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
--- a/paddle/fluid/operators/math/cos_sim_functor.cc
+++ b/paddle/fluid/operators/math/cos_sim_functor.cc
--- a/paddle/fluid/operators/math/cos_sim_functor.cu
+++ b/paddle/fluid/operators/math/cos_sim_functor.cu
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
--- a/paddle/fluid/operators/math/detection_util.h
+++ b/paddle/fluid/operators/math/detection_util.h
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
--- a/paddle/fluid/operators/math/gru_compute.h
+++ b/paddle/fluid/operators/math/gru_compute.h
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
--- a/paddle/fluid/operators/math/im2col.h
+++ b/paddle/fluid/operators/math/im2col.h
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
--- a/paddle/fluid/operators/math/lstm_compute.cu
+++ b/paddle/fluid/operators/math/lstm_compute.cu
--- a/paddle/fluid/operators/math/lstm_compute.h
+++ b/paddle/fluid/operators/math/lstm_compute.h
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
--- a/paddle/fluid/operators/math/matmul.h
+++ b/paddle/fluid/operators/math/matmul.h
--- a/paddle/fluid/operators/math/maxouting.cc
+++ b/paddle/fluid/operators/math/maxouting.cc
--- a/paddle/fluid/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
--- a/paddle/fluid/operators/math/unpooling.cc
+++ b/paddle/fluid/operators/math/unpooling.cc
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
--- a/paddle/fluid/operators/math/unpooling.h
+++ b/paddle/fluid/operators/math/unpooling.h
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
--- a/paddle/fluid/operators/math/vol2col.h
+++ b/paddle/fluid/operators/math/vol2col.h
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
--- a/paddle/fluid/operators/matmul_op.cu.cc
+++ b/paddle/fluid/operators/matmul_op.cu.cc
--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
--- a/paddle/fluid/operators/maxout_op.cu.cc
+++ b/paddle/fluid/operators/maxout_op.cu.cc
--- a/paddle/fluid/operators/maxout_op.h
+++ b/paddle/fluid/operators/maxout_op.h
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
--- a/paddle/fluid/operators/minus_op.cu
+++ b/paddle/fluid/operators/minus_op.cu
--- a/paddle/fluid/operators/minus_op.h
+++ b/paddle/fluid/operators/minus_op.h
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
--- a/paddle/fluid/operators/multiplex_op.cu
+++ b/paddle/fluid/operators/multiplex_op.cu
--- a/paddle/fluid/operators/multiplex_op.h
+++ b/paddle/fluid/operators/multiplex_op.h
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
--- a/paddle/fluid/operators/net_op.cc
+++ b/paddle/fluid/operators/net_op.cc
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
--- a/paddle/fluid/operators/one_hot_op.h
+++ b/paddle/fluid/operators/one_hot_op.h
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
--- a/paddle/fluid/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/pool_op.cu.cc
+++ b/paddle/fluid/operators/pool_op.cu.cc
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
--- a/paddle/fluid/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
--- a/paddle/fluid/operators/precision_recall_op.h
+++ b/paddle/fluid/operators/precision_recall_op.h
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
--- a/paddle/fluid/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
--- a/paddle/fluid/operators/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/proximal_adagrad_op.cu
--- a/paddle/fluid/operators/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/proximal_adagrad_op.h
--- a/paddle/fluid/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
--- a/paddle/fluid/operators/proximal_gd_op.cu
+++ b/paddle/fluid/operators/proximal_gd_op.cu
--- a/paddle/fluid/operators/proximal_gd_op.h
+++ b/paddle/fluid/operators/proximal_gd_op.h
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
--- a/paddle/fluid/operators/rank_loss_op.cu
+++ b/paddle/fluid/operators/rank_loss_op.cu
--- a/paddle/fluid/operators/rank_loss_op.h
+++ b/paddle/fluid/operators/rank_loss_op.h
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/reduce_op.cc
+++ b/paddle/fluid/operators/reduce_op.cc
--- a/paddle/fluid/operators/reduce_op.cu
+++ b/paddle/fluid/operators/reduce_op.cu
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/reshape_op.cu
+++ b/paddle/fluid/operators/reshape_op.cu
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cu
+++ b/paddle/fluid/operators/rmsprop_op.cu
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
--- a/paddle/fluid/operators/row_conv_op.h
+++ b/paddle/fluid/operators/row_conv_op.h
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cu.cc
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_conv_op.cu.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cu.cc
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
--- a/paddle/fluid/operators/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_erase_op.cu
--- a/paddle/fluid/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
--- a/paddle/fluid/operators/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_pool_op.cu
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
--- a/paddle/fluid/operators/sequence_reshape_op.cu
+++ b/paddle/fluid/operators/sequence_reshape_op.cu
--- a/paddle/fluid/operators/sequence_reshape_op.h
+++ b/paddle/fluid/operators/sequence_reshape_op.h
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.cu
+++ b/paddle/fluid/operators/sequence_slice_op.cu
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
--- a/paddle/fluid/operators/sign_op.cu
+++ b/paddle/fluid/operators/sign_op.cu
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
--- a/paddle/fluid/operators/smooth_l1_loss_op.h
+++ b/paddle/fluid/operators/smooth_l1_loss_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
--- a/paddle/fluid/operators/split_selected_rows_op.cu
+++ b/paddle/fluid/operators/split_selected_rows_op.cu
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
--- a/paddle/fluid/operators/spp_op.cu.cc
+++ b/paddle/fluid/operators/spp_op.cu.cc
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
--- a/paddle/fluid/operators/squared_l2_distance_op.h
+++ b/paddle/fluid/operators/squared_l2_distance_op.h
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
--- a/paddle/fluid/operators/squared_l2_norm_op.h
+++ b/paddle/fluid/operators/squared_l2_norm_op.h
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/target_assign_op.cc
--- a/paddle/fluid/operators/target_assign_op.cu
+++ b/paddle/fluid/operators/target_assign_op.cu
--- a/paddle/fluid/operators/target_assign_op.h
+++ b/paddle/fluid/operators/target_assign_op.h
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
--- a/paddle/fluid/operators/unpool_op.cu.cc
+++ b/paddle/fluid/operators/unpool_op.cu.cc
--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
--- a/paddle/fluid/operators/warpctc_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_op.cu.cc
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
--- a/paddle/fluid/platform/call_once.h
+++ b/paddle/fluid/platform/call_once.h
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/cudnn_helper_test.cc
+++ b/paddle/fluid/platform/cudnn_helper_test.cc
--- a/paddle/fluid/platform/details/device_ptr_cast.h
+++ b/paddle/fluid/platform/details/device_ptr_cast.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ b/paddle/fluid/platform/dynload/cupti.cc
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
--- a/paddle/fluid/platform/dynload/curand.cc
+++ b/paddle/fluid/platform/dynload/curand.cc
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/warpctc.cc
+++ b/paddle/fluid/platform/dynload/warpctc.cc
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/enforce.cc
+++ b/paddle/fluid/platform/enforce.cc
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
--- a/paddle/math/tests/test_float16.cpp
+++ b/paddle/math/tests/test_float16.cpp
--- a/paddle/math/tests/test_float16.cu
+++ b/paddle/math/tests/test_float16.cu
--- a/paddle/fluid/platform/for_range.h
+++ b/paddle/fluid/platform/for_range.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/hostdevice.h
+++ b/paddle/fluid/platform/hostdevice.h
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/platform/nccl_test.cu
+++ b/paddle/fluid/platform/nccl_test.cu
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
--- a/paddle/fluid/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/string/.clang-format
+++ b/paddle/string/.clang-format
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
--- a/paddle/string/piece.cc
+++ b/paddle/string/piece.cc
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/string/piece_test.cc
+++ b/paddle/string/piece_test.cc
--- a/paddle/string/printf.h
+++ b/paddle/string/printf.h
--- a/paddle/string/printf_test.cc
+++ b/paddle/string/printf_test.cc
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
--- a/paddle/function/BlockExpandOpTest.cpp
+++ b/paddle/function/BlockExpandOpTest.cpp
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
--- a/paddle/function/CosSimOp.h
+++ b/paddle/function/CosSimOp.h
--- a/paddle/function/CosSimOpGpu.cu
+++ b/paddle/function/CosSimOpGpu.cu
--- a/paddle/function/CosSimOpTest.cpp
+++ b/paddle/function/CosSimOpTest.cpp
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
--- a/paddle/function/CropOp.h
+++ b/paddle/function/CropOp.h
--- a/paddle/function/CropOpGpu.cu
+++ b/paddle/function/CropOpGpu.cu
--- a/paddle/function/CropOpTest.cpp
+++ b/paddle/function/CropOpTest.cpp
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
--- a/paddle/function/CrossMapNormalOp.h
+++ b/paddle/function/CrossMapNormalOp.h
--- a/paddle/function/CrossMapNormalOpGpu.cu
+++ b/paddle/function/CrossMapNormalOpGpu.cu
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
--- a/paddle/function/GemmFunctor.cpp
+++ b/paddle/function/GemmFunctor.cpp
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
--- a/paddle/function/GruFunctor.h
+++ b/paddle/function/GruFunctor.h
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp