Commit 101378c8 authored by F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into remove_evaluator

......@@ -27,7 +27,6 @@ third_party/
cmake-build-*
# generated while compiling
python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake
......
......@@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
project(paddle CXX C)
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
......@@ -60,7 +60,7 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" ON)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -146,6 +146,7 @@ include(external/cares)
include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
include(generic) # simplify cmake module
include(package) # set paddle packages
......@@ -174,7 +175,7 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
include(cuda)
include(cuda)
endif(WITH_GPU)
if(WITH_MKLML)
......@@ -201,17 +202,18 @@ endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
# placed after this block, because they depend on it.
if(WITH_GOLANG)
enable_language(Go)
add_subdirectory(go)
endif(WITH_GOLANG)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
add_subdirectory(python)
endif()
if(WITH_DOC)
......
......@@ -22,7 +22,8 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
git python-pip python-dev openssh-server bison libnccl-dev \
git python-pip python-dev openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
......
......@@ -21,16 +21,6 @@ RUN apt-get update && \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# Install Go and glide
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# must not be on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# git credential to skip password typing
RUN git config --global credential.helper store
......
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default, CI-built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
# You can get the mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# If the network is slow, you may need to add a proxy here.
# ENV https_proxy=
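# Pre-warm the cifar10 dataset cache at build time: constructing the reader via
# paddle.dataset.cifar.train10() triggers the download (an assumption about the v2 dataset API).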
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# Unset the proxy if it is set.
# ENV https_proxy=""
# NOTE: By default, CI-built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
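# A hedged sketch of how such a wheel could be built (WITH_DISTRIBUTE matches the
# CMake option in this commit; the other flags and the output path are assumptions):
#   cmake .. -DWITH_DISTRIBUTE=ON -DWITH_GPU=ON && make -j
#   # the wheel is then expected under build/python/dist/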
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# TensorFlow-on-Kubernetes helpers
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# The lines below may change frequently during debugging.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/
......@@ -11,7 +11,7 @@ spec:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
......
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never
......@@ -68,6 +68,21 @@ parser.add_argument(
type=str2bool,
default=True,
help='Whether to run as local mode.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--trainer_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
......@@ -180,8 +195,9 @@ def main():
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - ts))
) # The accuracy is accumulated across batches, not just the current batch.
pass_elapsed = time.time() - start_time
......@@ -209,27 +225,24 @@ def main():
batch_size=args.batch_size)
train_loop(exe, fluid.default_main_program())
else:
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, "6174"]))
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
print("trainers total: ", trainers)
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainer_id=args.task_index,
pservers=args.ps_hosts,
trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_INIT_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
......
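The hunk above switches the transpiler from environment-derived pserver endpoints to the `--ps_hosts`/`--task_index` flags. A minimal sketch of the full role dispatch, assuming the `DistributeTranspiler` methods of fluid from this era (`get_pserver_program`, `get_startup_program`, `get_trainer_program`); `exe`, `train_loop`, and the other names come from the diff:

```python
t = fluid.DistributeTranspiler()
t.transpile(optimize_ops, params_grads,
            trainer_id=args.task_index,
            pservers=args.ps_hosts,
            trainers=trainers)
if training_role == "PSERVER":
    # Run the parameter-server side of the transpiled program.
    pserver_prog = t.get_pserver_program(current_endpoint)
    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
else:
    # Trainers run the trimmed main program produced by the transpiler.
    train_loop(exe, t.get_trainer_program())
```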
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow
You can find distributed training example templates here:
https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
https://www.tensorflow.org/deploy/distributed
"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='CPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
help='The data order, NCHW=[batch, channels, height, width]. '
'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--worker_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs")
parser.add_argument(
"--job_name", type=str, default="", help="One of 'worker', 'ps'")
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index", type=int, default=0, help="Index of task within the job")
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark(cluster_spec, server):
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = tf.train.replica_device_setter(
worker_device="/job:worker/task:{}".format(args.task_index),
cluster=cluster_spec)
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
global_step = tf.Variable(0, name='global_step', trainable=False)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss, global_step=global_step)
summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
hooks = [tf.train.StopAtStepHook(last_step=1000000)]
with tf.train.MonitoredTrainingSession(
master=server.target, is_chief=(args.task_index == 0),
hooks=hooks) as sess:
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
iter_begin_time = time.time()
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
% (pass_id, iters, loss, acc,
len(data) / (time.time() - iter_begin_time)))
num_samples += len(data)
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
ps_hosts = args.ps_hosts.split(",")
worker_hosts = args.worker_hosts.split(",")
# Create a cluster from the parameter server and worker hosts.
cluster_spec = tf.train.ClusterSpec({
"ps": ps_hosts,
"worker": worker_hosts
})
# Create and start a server for the local task.
server = tf.train.Server(
cluster_spec, job_name=args.job_name, task_index=args.task_index)
if args.job_name == "ps":
print("start pserver")
server.join()
elif args.job_name == "worker":
print("start worker")
run_benchmark(cluster_spec, server)
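For a quick local smoke test of the flags defined above, one could (hypothetically) run `python vgg16_tf.py --job_name=ps --task_index=0 --ps_hosts=127.0.0.1:2222 --worker_hosts=127.0.0.1:2223` in one shell and the same command with `--job_name=worker` in another; in the Kubernetes jobs above, `tf_k8s` assembles these flags from the pod environment.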
......@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
......@@ -73,7 +74,14 @@ else()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn
......
......@@ -155,7 +155,8 @@ endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
......
if(NOT WITH_GPU)
return()
endif()
set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
find_path(CUPTI_INCLUDE_DIR cupti.h
PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
$ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
NO_DEFAULT_PATH
)
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
set(TARGET_ARCH "x86_64")
if(CMAKE_SYSTEM_PROCESSOR)
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()
list(APPEND CUPTI_CHECK_LIBRARY_DIRS
${CUPTI_ROOT}
${CUPTI_ROOT}/lib64
${CUPTI_ROOT}/lib
${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
$ENV{CUPTI_ROOT}
$ENV{CUPTI_ROOT}/lib64
$ENV{CUPTI_ROOT}/lib
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH
DOC "Path to cuPTI library.")
get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
set(CUPTI_FOUND ON)
else()
set(CUPTI_FOUND OFF)
endif()
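Since `CUPTI_ROOT` is a cache variable and `$ENV{CUPTI_ROOT}` is also searched, a non-default CUPTI installation can be selected with `cmake .. -DCUPTI_ROOT=/path/to/CUPTI` or by exporting `CUPTI_ROOT` before configuring (the path here is illustrative).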
......@@ -8,7 +8,7 @@ data_feeder
DataFeeder
----------
.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
.. autoclass:: paddle.fluid.data_feeder.DataFeeder
:members:
:noindex:
......@@ -8,14 +8,14 @@ evaluator
Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
.. autoclass:: paddle.fluid.evaluator.Accuracy
:members:
:noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
......@@ -8,25 +8,25 @@ executor
Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
.. autoclass:: paddle.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
.. autofunction:: paddle.fluid.executor.global_scope
:noindex:
scope_guard
-----------
.. autofunction:: paddle.v2.fluid.executor.scope_guard
.. autofunction:: paddle.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
.. autofunction:: paddle.fluid.executor.switch_scope
:noindex:
......@@ -17,7 +17,7 @@ import argparse
import sys
import types
import paddle.v2.fluid as fluid
import paddle.fluid as fluid
def parse_arg():
......@@ -70,7 +70,7 @@ class DocGenerator(object):
def print_class(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1}
:members:
:noindex:
......@@ -78,7 +78,7 @@ class DocGenerator(object):
def print_method(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1}
:noindex:
'''.format(self.module_name, name))
......
======================
Fluid
======================
.. toctree::
:maxdepth: 1
layers.rst
data_feeder.rst
executor.rst
initializer.rst
evaluator.rst
nets.rst
optimizer.rst
param_attr.rst
profiler.rst
regularizer.rst
io.rst
......@@ -8,28 +8,28 @@ initializer
Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
.. autoclass:: paddle.fluid.initializer.Constant
:members:
:noindex:
Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
.. autoclass:: paddle.fluid.initializer.Uniform
:members:
:noindex:
Normal
------
.. autoclass:: paddle.v2.fluid.initializer.Normal
.. autoclass:: paddle.fluid.initializer.Normal
:members:
:noindex:
Xavier
------
.. autoclass:: paddle.v2.fluid.initializer.Xavier
.. autoclass:: paddle.fluid.initializer.Xavier
:members:
:noindex:
......@@ -8,54 +8,54 @@ io
save_vars
---------
.. autofunction:: paddle.v2.fluid.io.save_vars
.. autofunction:: paddle.fluid.io.save_vars
:noindex:
save_params
-----------
.. autofunction:: paddle.v2.fluid.io.save_params
.. autofunction:: paddle.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
.. autofunction:: paddle.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
.. autofunction:: paddle.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
.. autofunction:: paddle.fluid.io.load_params
:noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
.. autofunction:: paddle.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
.. autofunction:: paddle.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
.. autofunction:: paddle.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
......@@ -11,167 +11,167 @@ control_flow
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
.. autofunction:: paddle.fluid.layers.split_lod_tensor
:noindex:
merge_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
.. autofunction:: paddle.fluid.layers.merge_lod_tensor
:noindex:
BlockGuard
----------
.. autoclass:: paddle.v2.fluid.layers.BlockGuard
.. autoclass:: paddle.fluid.layers.BlockGuard
:members:
:noindex:
BlockGuardWithCompletion
------------------------
.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
.. autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
:members:
:noindex:
StaticRNNMemoryLink
-------------------
.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
:members:
:noindex:
WhileGuard
----------
.. autoclass:: paddle.v2.fluid.layers.WhileGuard
.. autoclass:: paddle.fluid.layers.WhileGuard
:members:
:noindex:
While
-----
.. autoclass:: paddle.v2.fluid.layers.While
.. autoclass:: paddle.fluid.layers.While
:members:
:noindex:
lod_rank_table
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
.. autofunction:: paddle.fluid.layers.lod_rank_table
:noindex:
max_sequence_len
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
.. autofunction:: paddle.fluid.layers.max_sequence_len
:noindex:
topk
----
.. autofunction:: paddle.v2.fluid.layers.topk
.. autofunction:: paddle.fluid.layers.topk
:noindex:
lod_tensor_to_array
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
.. autofunction:: paddle.fluid.layers.lod_tensor_to_array
:noindex:
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
.. autofunction:: paddle.fluid.layers.array_to_lod_tensor
:noindex:
increment
---------
.. autofunction:: paddle.v2.fluid.layers.increment
.. autofunction:: paddle.fluid.layers.increment
:noindex:
array_write
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
.. autofunction:: paddle.fluid.layers.array_write
:noindex:
create_array
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
.. autofunction:: paddle.fluid.layers.create_array
:noindex:
less_than
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
.. autofunction:: paddle.fluid.layers.less_than
:noindex:
array_read
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
.. autofunction:: paddle.fluid.layers.array_read
:noindex:
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
.. autofunction:: paddle.fluid.layers.shrink_memory
:noindex:
array_length
------------
.. autofunction:: paddle.v2.fluid.layers.array_length
.. autofunction:: paddle.fluid.layers.array_length
:noindex:
IfElse
------
.. autoclass:: paddle.v2.fluid.layers.IfElse
.. autoclass:: paddle.fluid.layers.IfElse
:members:
:noindex:
DynamicRNN
----------
.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
.. autoclass:: paddle.fluid.layers.DynamicRNN
:members:
:noindex:
ConditionalBlock
----------------
.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
.. autoclass:: paddle.fluid.layers.ConditionalBlock
:members:
:noindex:
StaticRNN
---------
.. autoclass:: paddle.v2.fluid.layers.StaticRNN
.. autoclass:: paddle.fluid.layers.StaticRNN
:members:
:noindex:
reorder_lod_tensor_by_rank
--------------------------
.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
:noindex:
ParallelDo
----------
.. autoclass:: paddle.v2.fluid.layers.ParallelDo
.. autoclass:: paddle.fluid.layers.ParallelDo
:members:
:noindex:
Print
-----
.. autofunction:: paddle.v2.fluid.layers.Print
.. autofunction:: paddle.fluid.layers.Print
:noindex:
device
......@@ -180,7 +180,7 @@ device
get_places
----------
.. autofunction:: paddle.v2.fluid.layers.get_places
.. autofunction:: paddle.fluid.layers.get_places
:noindex:
io
......@@ -189,27 +189,27 @@ io
data
----
.. autofunction:: paddle.v2.fluid.layers.data
.. autofunction:: paddle.fluid.layers.data
:noindex:
BlockGuardServ
--------------
.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
.. autoclass:: paddle.fluid.layers.BlockGuardServ
:members:
:noindex:
ListenAndServ
-------------
.. autoclass:: paddle.v2.fluid.layers.ListenAndServ
.. autoclass:: paddle.fluid.layers.ListenAndServ
:members:
:noindex:
Send
----
.. autofunction:: paddle.v2.fluid.layers.Send
.. autofunction:: paddle.fluid.layers.Send
:noindex:
nn
......@@ -218,259 +218,259 @@ nn
fc
--
.. autofunction:: paddle.v2.fluid.layers.fc
.. autofunction:: paddle.fluid.layers.fc
:noindex:
embedding
---------
.. autofunction:: paddle.v2.fluid.layers.embedding
.. autofunction:: paddle.fluid.layers.embedding
:noindex:
dynamic_lstm
------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
.. autofunction:: paddle.fluid.layers.dynamic_lstm
:noindex:
dynamic_lstmp
-------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
.. autofunction:: paddle.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
.. autofunction:: paddle.fluid.layers.dynamic_gru
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
.. autofunction:: paddle.fluid.layers.gru_unit
:noindex:
linear_chain_crf
----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
.. autofunction:: paddle.fluid.layers.linear_chain_crf
:noindex:
crf_decoding
------------
.. autofunction:: paddle.v2.fluid.layers.crf_decoding
.. autofunction:: paddle.fluid.layers.crf_decoding
:noindex:
cos_sim
-------
.. autofunction:: paddle.v2.fluid.layers.cos_sim
.. autofunction:: paddle.fluid.layers.cos_sim
:noindex:
cross_entropy
-------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
.. autofunction:: paddle.fluid.layers.cross_entropy
:noindex:
square_error_cost
-----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
.. autofunction:: paddle.fluid.layers.square_error_cost
:noindex:
accuracy
--------
.. autofunction:: paddle.v2.fluid.layers.accuracy
.. autofunction:: paddle.fluid.layers.accuracy
:noindex:
chunk_eval
----------
.. autofunction:: paddle.v2.fluid.layers.chunk_eval
.. autofunction:: paddle.fluid.layers.chunk_eval
:noindex:
sequence_conv
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
.. autofunction:: paddle.fluid.layers.sequence_conv
:noindex:
conv2d
------
.. autofunction:: paddle.v2.fluid.layers.conv2d
.. autofunction:: paddle.fluid.layers.conv2d
:noindex:
sequence_pool
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
.. autofunction:: paddle.fluid.layers.sequence_pool
:noindex:
pool2d
------
.. autofunction:: paddle.v2.fluid.layers.pool2d
.. autofunction:: paddle.fluid.layers.pool2d
:noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
.. autofunction:: paddle.fluid.layers.batch_norm
:noindex:
layer_norm
----------
.. autofunction:: paddle.v2.fluid.layers.layer_norm
.. autofunction:: paddle.fluid.layers.layer_norm
:noindex:
beam_search_decode
------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
.. autofunction:: paddle.fluid.layers.beam_search_decode
:noindex:
conv2d_transpose
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
.. autofunction:: paddle.fluid.layers.conv2d_transpose
:noindex:
sequence_expand
---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
.. autofunction:: paddle.fluid.layers.sequence_expand
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
.. autofunction:: paddle.fluid.layers.lstm_unit
:noindex:
reduce_sum
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
.. autofunction:: paddle.fluid.layers.reduce_sum
:noindex:
reduce_mean
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
.. autofunction:: paddle.fluid.layers.reduce_mean
:noindex:
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
.. autofunction:: paddle.fluid.layers.reduce_max
:noindex:
reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
.. autofunction:: paddle.fluid.layers.reduce_min
:noindex:
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
.. autofunction:: paddle.fluid.layers.sequence_first_step
:noindex:
sequence_last_step
------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
.. autofunction:: paddle.fluid.layers.sequence_last_step
:noindex:
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
.. autofunction:: paddle.fluid.layers.dropout
:noindex:
split
-----
.. autofunction:: paddle.v2.fluid.layers.split
.. autofunction:: paddle.fluid.layers.split
:noindex:
ctc_greedy_decoder
------------------
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder
:noindex:
edit_distance
-------------
.. autofunction:: paddle.v2.fluid.layers.edit_distance
.. autofunction:: paddle.fluid.layers.edit_distance
:noindex:
l2_normalize
------------
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
.. autofunction:: paddle.fluid.layers.l2_normalize
:noindex:
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
.. autofunction:: paddle.fluid.layers.matmul
:noindex:
warpctc
-------
.. autofunction:: paddle.v2.fluid.layers.warpctc
.. autofunction:: paddle.fluid.layers.warpctc
:noindex:
sequence_reshape
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
.. autofunction:: paddle.fluid.layers.sequence_reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
.. autofunction:: paddle.fluid.layers.transpose
:noindex:
im2sequence
-----------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
.. autofunction:: paddle.fluid.layers.im2sequence
:noindex:
nce
---
.. autofunction:: paddle.v2.fluid.layers.nce
.. autofunction:: paddle.fluid.layers.nce
:noindex:
beam_search
-----------
.. autofunction:: paddle.v2.fluid.layers.beam_search
.. autofunction:: paddle.fluid.layers.beam_search
:noindex:
row_conv
--------
.. autofunction:: paddle.v2.fluid.layers.row_conv
.. autofunction:: paddle.fluid.layers.row_conv
:noindex:
multiplex
---------
.. autofunction:: paddle.v2.fluid.layers.multiplex
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
ops
......@@ -479,259 +479,259 @@ ops
mean
----
.. autofunction:: paddle.v2.fluid.layers.mean
.. autofunction:: paddle.fluid.layers.mean
:noindex:
mul
---
.. autofunction:: paddle.v2.fluid.layers.mul
.. autofunction:: paddle.fluid.layers.mul
:noindex:
reshape
-------
.. autofunction:: paddle.v2.fluid.layers.reshape
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
scale
-----
.. autofunction:: paddle.v2.fluid.layers.scale
.. autofunction:: paddle.fluid.layers.scale
:noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
elementwise_add
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
.. autofunction:: paddle.fluid.layers.elementwise_add
:noindex:
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
.. autofunction:: paddle.fluid.layers.elementwise_div
:noindex:
elementwise_sub
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
.. autofunction:: paddle.fluid.layers.elementwise_sub
:noindex:
elementwise_mul
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
.. autofunction:: paddle.fluid.layers.elementwise_mul
:noindex:
elementwise_max
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_max
.. autofunction:: paddle.fluid.layers.elementwise_max
:noindex:
elementwise_min
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_min
.. autofunction:: paddle.fluid.layers.elementwise_min
:noindex:
elementwise_pow
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_pow
.. autofunction:: paddle.fluid.layers.elementwise_pow
:noindex:
clip
----
.. autofunction:: paddle.v2.fluid.layers.clip
.. autofunction:: paddle.fluid.layers.clip
:noindex:
clip_by_norm
------------
.. autofunction:: paddle.v2.fluid.layers.clip_by_norm
.. autofunction:: paddle.fluid.layers.clip_by_norm
:noindex:
sequence_softmax
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
.. autofunction:: paddle.fluid.layers.sequence_softmax
:noindex:
sigmoid
-------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
.. autofunction:: paddle.fluid.layers.sigmoid
:noindex:
logsigmoid
----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
.. autofunction:: paddle.fluid.layers.logsigmoid
:noindex:
exp
---
.. autofunction:: paddle.v2.fluid.layers.exp
.. autofunction:: paddle.fluid.layers.exp
:noindex:
relu
----
.. autofunction:: paddle.v2.fluid.layers.relu
.. autofunction:: paddle.fluid.layers.relu
:noindex:
tanh
----
.. autofunction:: paddle.v2.fluid.layers.tanh
.. autofunction:: paddle.fluid.layers.tanh
:noindex:
tanh_shrink
-----------
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink
.. autofunction:: paddle.fluid.layers.tanh_shrink
:noindex:
softshrink
----------
.. autofunction:: paddle.v2.fluid.layers.softshrink
.. autofunction:: paddle.fluid.layers.softshrink
:noindex:
sqrt
----
.. autofunction:: paddle.v2.fluid.layers.sqrt
.. autofunction:: paddle.fluid.layers.sqrt
:noindex:
abs
---
.. autofunction:: paddle.v2.fluid.layers.abs
.. autofunction:: paddle.fluid.layers.abs
:noindex:
ceil
----
.. autofunction:: paddle.v2.fluid.layers.ceil
.. autofunction:: paddle.fluid.layers.ceil
:noindex:
floor
-----
.. autofunction:: paddle.v2.fluid.layers.floor
.. autofunction:: paddle.fluid.layers.floor
:noindex:
round
-----
.. autofunction:: paddle.v2.fluid.layers.round
.. autofunction:: paddle.fluid.layers.round
:noindex:
reciprocal
----------
.. autofunction:: paddle.v2.fluid.layers.reciprocal
.. autofunction:: paddle.fluid.layers.reciprocal
:noindex:
log
---
.. autofunction:: paddle.v2.fluid.layers.log
.. autofunction:: paddle.fluid.layers.log
:noindex:
square
------
.. autofunction:: paddle.v2.fluid.layers.square
.. autofunction:: paddle.fluid.layers.square
:noindex:
softplus
--------
.. autofunction:: paddle.v2.fluid.layers.softplus
.. autofunction:: paddle.fluid.layers.softplus
:noindex:
softsign
--------
.. autofunction:: paddle.v2.fluid.layers.softsign
.. autofunction:: paddle.fluid.layers.softsign
:noindex:
brelu
-----
.. autofunction:: paddle.v2.fluid.layers.brelu
.. autofunction:: paddle.fluid.layers.brelu
:noindex:
leaky_relu
----------
.. autofunction:: paddle.v2.fluid.layers.leaky_relu
.. autofunction:: paddle.fluid.layers.leaky_relu
:noindex:
soft_relu
---------
.. autofunction:: paddle.v2.fluid.layers.soft_relu
.. autofunction:: paddle.fluid.layers.soft_relu
:noindex:
elu
---
.. autofunction:: paddle.v2.fluid.layers.elu
.. autofunction:: paddle.fluid.layers.elu
:noindex:
relu6
-----
.. autofunction:: paddle.v2.fluid.layers.relu6
.. autofunction:: paddle.fluid.layers.relu6
:noindex:
pow
---
.. autofunction:: paddle.v2.fluid.layers.pow
.. autofunction:: paddle.fluid.layers.pow
:noindex:
stanh
-----
.. autofunction:: paddle.v2.fluid.layers.stanh
.. autofunction:: paddle.fluid.layers.stanh
:noindex:
hard_shrink
-----------
.. autofunction:: paddle.v2.fluid.layers.hard_shrink
.. autofunction:: paddle.fluid.layers.hard_shrink
:noindex:
thresholded_relu
----------------
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu
.. autofunction:: paddle.fluid.layers.thresholded_relu
:noindex:
hard_sigmoid
------------
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
.. autofunction:: paddle.fluid.layers.hard_sigmoid
:noindex:
swish
-----
.. autofunction:: paddle.v2.fluid.layers.swish
.. autofunction:: paddle.fluid.layers.swish
:noindex:
tensor
......@@ -740,66 +740,66 @@ tensor
create_tensor
-------------
.. autofunction:: paddle.v2.fluid.layers.create_tensor
.. autofunction:: paddle.fluid.layers.create_tensor
:noindex:
create_parameter
----------------
.. autofunction:: paddle.v2.fluid.layers.create_parameter
.. autofunction:: paddle.fluid.layers.create_parameter
:noindex:
create_global_var
-----------------
.. autofunction:: paddle.v2.fluid.layers.create_global_var
.. autofunction:: paddle.fluid.layers.create_global_var
:noindex:
cast
----
.. autofunction:: paddle.v2.fluid.layers.cast
.. autofunction:: paddle.fluid.layers.cast
:noindex:
concat
------
.. autofunction:: paddle.v2.fluid.layers.concat
.. autofunction:: paddle.fluid.layers.concat
:noindex:
sums
----
.. autofunction:: paddle.v2.fluid.layers.sums
.. autofunction:: paddle.fluid.layers.sums
:noindex:
assign
------
.. autofunction:: paddle.v2.fluid.layers.assign
.. autofunction:: paddle.fluid.layers.assign
:noindex:
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
:noindex:
fill_constant
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
.. autofunction:: paddle.fluid.layers.fill_constant
:noindex:
ones
----
.. autofunction:: paddle.v2.fluid.layers.ones
.. autofunction:: paddle.fluid.layers.ones
:noindex:
zeros
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
......@@ -8,24 +8,24 @@ nets
simple_img_conv_pool
--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
:noindex:
sequence_conv_pool
------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
.. autofunction:: paddle.fluid.nets.sequence_conv_pool
:noindex:
glu
---
.. autofunction:: paddle.v2.fluid.nets.glu
.. autofunction:: paddle.fluid.nets.glu
:noindex:
scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention
:noindex:
......@@ -8,42 +8,42 @@ optimizer
SGD
---
.. autoclass:: paddle.v2.fluid.optimizer.SGD
.. autoclass:: paddle.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
.. autoclass:: paddle.v2.fluid.optimizer.Momentum
.. autoclass:: paddle.fluid.optimizer.Momentum
:members:
:noindex:
Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
.. autoclass:: paddle.fluid.optimizer.Adagrad
:members:
:noindex:
Adam
----
.. autoclass:: paddle.v2.fluid.optimizer.Adam
.. autoclass:: paddle.fluid.optimizer.Adam
:members:
:noindex:
Adamax
------
.. autoclass:: paddle.v2.fluid.optimizer.Adamax
.. autoclass:: paddle.fluid.optimizer.Adamax
:members:
:noindex:
DecayedAdagrad
--------------
.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad
:members:
:noindex:
......@@ -8,14 +8,14 @@ param_attr
ParamAttr
---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
.. autoclass:: paddle.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
:members:
:noindex:
......@@ -8,18 +8,18 @@ profiler
cuda_profiler
-------------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
.. autofunction:: paddle.fluid.profiler.cuda_profiler
:noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
.. autofunction:: paddle.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
......@@ -8,20 +8,20 @@ regularizer
append_regularization_ops
-------------------------
.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
.. autoclass:: paddle.fluid.regularizer.L1Decay
:members:
:noindex:
L2Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
.. autoclass:: paddle.fluid.regularizer.L2Decay
:members:
:noindex:
API
===
.. toctree::
:maxdepth: 1
Model Configuration <v2/model_configs.rst>
Data Access <v2/data.rst>
Training and Inference <v2/run_logic.rst>
v2/fluid.rst
......@@ -4,7 +4,8 @@ API
.. toctree::
:maxdepth: 1
overview.rst
v2/model_configs.rst
v2/data.rst
v2/run_logic.rst
v2/fluid.rst
fluid/index.rst
V2 API Overview
================
The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1 (the original layer-based platform of PaddlePaddle).
It introduces high-level concepts such as `Layers <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/layer.html>`_ , `Optimizer <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/optimizer.html>`_ , `Evaluator <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/evaluators.html>`_ and `Data Reader <http://www.paddlepaddle.org/docs/develop/api/en/v2/data/data_reader.html>`_ to make model configuration more familiar to users.
A model is composed of the computation described by a group of `Layers`, with an `Evaluator` to define the error, an `Optimizer` to update the parameters and a `Data Reader` to feed in the data.
We also provide an `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phases.
It has several easy-to-use methods:
- `paddle.train`
- `paddle.test`
- `paddle.infer`
To better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are made available to users through callbacks.
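A minimal end-to-end sketch of this flow, assuming the fit-a-line style V2 API of this release (`paddle.layer.fc`, `paddle.trainer.SGD`, `paddle.event.EndIteration`); the exact layer and dataset names follow the public examples and may vary between versions.

.. code-block:: python

   import paddle.v2 as paddle

   paddle.init(use_gpu=False, trainer_count=1)

   # Layers describe the computation of the model.
   x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
   y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
   y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
   cost = paddle.layer.square_error_cost(input=y_predict, label=y)

   # The Optimizer updates the parameters created from the cost topology.
   parameters = paddle.parameters.create(cost)
   optimizer = paddle.optimizer.Momentum(momentum=0)
   trainer = paddle.trainer.SGD(
       cost=cost, parameters=parameters, update_equation=optimizer)

   def event_handler(event):
       # Callbacks receive events that expose the internal running details.
       if isinstance(event, paddle.event.EndIteration) and event.batch_id % 100 == 0:
           print("Pass %d, Batch %d, Cost %f" %
                 (event.pass_id, event.batch_id, event.cost))

   # The Data Reader feeds in the data batch by batch.
   trainer.train(
       reader=paddle.batch(paddle.dataset.uci_housing.train(), batch_size=2),
       event_handler=event_handler,
       num_passes=30)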
======================
Fluid
======================
.. toctree::
:maxdepth: 1
fluid/layers.rst
fluid/data_feeder.rst
fluid/executor.rst
fluid/initializer.rst
fluid/evaluator.rst
fluid/nets.rst
fluid/optimizer.rst
fluid/param_attr.rst
fluid/profiler.rst
fluid/regularizer.rst
fluid/io.rst
......@@ -189,7 +189,7 @@ PaddlePaddle's compile options, including CPU/GPU binary generation and which BLAS library to link
"WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
"WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
BLAS
......
......@@ -191,7 +191,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
"WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
......
......@@ -12,7 +12,7 @@ The following table compares concepts in Fluid and Go
| Go | Fluid |
|----|-------|
|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
......
......@@ -89,7 +89,7 @@ with train_loop.block():
h[t] = the_step(input[t])
```
An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
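A condensed sketch of what such a loop looks like, assuming the `fluid.layers.While` API of this era (`fill_constant`, `less_than`, and `increment` follow the linked test; the loop body is illustrative):

```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = fluid.layers.less_than(x=i, y=limit)

while_op = fluid.layers.While(cond=cond)
with while_op.block():
    # ... one step of the computation goes here ...
    fluid.layers.increment(x=i, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # refresh the loop condition
```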
......
......@@ -101,7 +101,7 @@ In-place is a built-in attribute of an operator. Since we treat in-place and oth
#### construct control flow graph
Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
- Block0:
......
......@@ -16,7 +16,7 @@
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
$ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
For more information on installing and using Docker, please refer to the `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ .
For more information on installing and using Docker, please refer to the `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_cn.html>`_ .
2. When building from source with CMake, the PythonLibs and PythonInterp versions found do not match
......
FAQ
====
This document answers some frequently asked questions about PaddlePaddle. If your question is not covered here, please look for an answer in the `PaddlePaddle community <https://github.com/PaddlePaddle/Paddle/issues>`_ or file an `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_ directly; we will reply promptly.
.. toctree::
:maxdepth: 1
......
......@@ -148,10 +148,10 @@ The Paddle binary catches floating-point exceptions at runtime; whenever a floating-point exception occurs
.. code-block:: python
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
For details, refer to the `nmt_without_attention <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ example.
......@@ -159,13 +159,13 @@ optimizer = paddle.optimizer.RMSProp(
.. code-block:: python
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
The complete code is available in the `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ example.
......
......@@ -196,6 +196,6 @@ The model parameter file saved by PaddlePaddle consists of a 16-byte header and the network parameters
obj="process",
args={"src_dict_path": src_dict_path})
The complete source code can be found in the `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ example.
The complete source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
C-API Inference Library
=======================
After a neural network model is trained, the next step is to use it for inference: preparing input data, running it through the model, and obtaining the prediction results.
Compared with training, inference has the following characteristics:
#. Inference does not need the back-propagation and parameter-update parts of training.
#. Inference does not need labels.
#. Inference often has to be integrated into the user's own system.
Given these characteristics, the inference SDK is designed separately and has the following properties:
#. The inference SDK excludes back-propagation and parameter updates to keep its size small.
#. The inference SDK provides a concise, easy-to-use interface.
#. Since input data may come in many structures, the input format is wrapped clearly and concisely.
#. For compatibility with user systems, the SDK exposes an interface that conforms to the C standard.
PaddlePaddle provides the C-API to address these needs. The following guides cover its usage:
.. toctree::
:maxdepth: 1
......
......@@ -65,6 +65,7 @@
output_file = "output.paddle.model"
merge_v2_model(net, param_file, output_file)
```
For the [handwritten digit recognition](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) example, you can directly run `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py). The serialized result is written to the `output.paddle.model` file in the current working directory. With this approach, the C-API can load the inference model at runtime by specifying the path to `output.paddle.model`.
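A self-contained sketch of that invocation, assuming the v2 utils import path of this era; `network` and the parameter file name are hypothetical placeholders for the trained topology and weights:

```python
from paddle.utils.merge_model import merge_v2_model
from mnist_v2 import network          # hypothetical module defining the trained topology

net = network(is_infer=True)          # must match the topology used in training
param_file = "param_pass_00000.tar"   # trained parameters (assumed file name)
output_file = "output.paddle.model"
merge_v2_model(net, param_file, output_file)
```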
#### Notes
......
......@@ -32,7 +32,7 @@ The non-cluster version of this demo with fluid API is as follows:
``` python
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
......@@ -125,11 +125,11 @@ for pass_id in range(100):
### E2E demo
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
First `cd` into the folder that contains the `python` files. In this case:
```bash
cd /paddle/python/paddle/v2/fluid/tests/book_distribute
cd /paddle/python/paddle/fluid/tests/book_distribute
```
On the parameter server node, run the following on the command line:
......
Running on Different Clusters
=============================
Users' cluster environments differ, so to ease deployment we provide several ways to submit cluster training jobs, introduced one by one below:
PaddlePaddle can build distributed training jobs on a variety of distributed computing platforms, including:
- `Kubernetes <http://kubernetes.io>`_ : Google's open-source container-cluster scheduling framework, a complete solution for large-scale production clusters.
- `OpenMPI <https://www.open-mpi.org>`_ : a mature high-performance parallel computing framework.
- `Fabric <http://www.fabfile.org>`_ : a cluster management tool; `Fabric` can be used to write scripts for submitting and managing cluster jobs.
`Kubernetes <http://kubernetes.io>`_ is Google's open-source container-cluster scheduling framework and a complete solution for large-scale production clusters. The following guides show PaddlePaddle's support for Kubernetes:
For each cluster platform, we describe how to start and stop cluster jobs. All of these examples can be found in `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_.
.. toctree::
:maxdepth: 1
k8s_cn.md
k8s_distributed_cn.md
When training on a distributed computing platform, the platform usually passes the parameters a job needs at scheduling time, such as node IDs, IPs, and the number of job nodes, through an API or environment variables.
`OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework that is widely used in HPC. The following guide describes how to set up a PaddlePaddle cluster training job with OpenMPI:
.. toctree::
:maxdepth: 1
fabric_cn.md
openmpi_cn.md
k8s_cn.md
k8s_distributed_cn.md
`Fabric <http://www.fabfile.org>`_ is a handy tool for deploying and managing programs. We provide a Fabric-based way to deploy and manage training jobs; for details, please read the following guide:
.. toctree::
:maxdepth: 1
fabric_cn.md
We also support deploying PaddlePaddle on AWS; for details, see:
.. toctree::
:maxdepth: 1
k8s_aws_cn.md
You can find the examples for all of the above in `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_.
......@@ -35,7 +35,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1(<module>)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
4696 12.040 0.003 12.040 0.003 {built-in method run}
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
```
......@@ -61,9 +61,9 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
```
We can see that the most time-consuming function is the C++-side `run`; tuning it requires the mixed `Python`/`C++` profiling covered in the second section. The `sync_with_cpp` function also has a long total time and a long per-call time, so we can open the details of `sync_with_cpp` to inspect its callers and callees.
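The same profile can also be generated and inspected with the standard library alone, without `cprofilev`:
```python
# First produce the profile:  python -m cProfile -o profile.out main.py
import pstats

stats = pstats.Stats("profile.out")
stats.sort_stats("tottime").print_stats(10)   # top 10 functions by self time
stats.print_callers("sync_with_cpp")          # who calls sync_with_cpp, and how often
```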
......@@ -76,9 +76,9 @@ Called By:
Function was called by...
ncalls tottime cumtime
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
Called:
......
......@@ -49,7 +49,7 @@ port, we will see the output like the following:
```
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1(<module>)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
4696 12.040 0.003 12.040 0.003 {built-in method run}
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
```
......@@ -74,9 +74,9 @@ focus on. We can sort above profiling file by tottime:
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
```
We can see that the most time-consuming function is the `built-in
......@@ -93,9 +93,9 @@ Called By:
Function was called by...
ncalls tottime cumtime
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
Called:
......
# PaddlePaddle Fluid Source Code Overview
Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
......@@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
```
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
- Every Layer has one or more operators and variables/parameters
- All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth reading:
- Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
- Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
- Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
- Optimizer: `fluid.optimizer.SGD`. It does the following:
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
# Run Time
......@@ -57,7 +57,7 @@ exe.run(fluid.default_main_program(),
- Place: `place`, one of CPU, GPU, or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
- The device handles are defined in [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
- Feeds the data: `feed=feeder.feed(data)`
- Evaluates all the operators
- Fetches the result: `fetch_list=[avg_cost]`
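Tying the pieces above together, a minimal end-to-end sketch; argument names such as `mean(x=...)` follow this era's fluid API and may differ in later versions, and random in-memory data replaces a real reader:
```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
fluid.optimizer.SGD(learning_rate=0.001).minimize(avg_cost)

place = fluid.CPUPlace()                  # Place: CPU here
exe = fluid.Executor(place)               # Executor bound to the place
exe.run(fluid.default_startup_program())  # initialize parameters
out = exe.run(fluid.default_main_program(),
              feed={'x': np.random.rand(8, 13).astype('float32'),
                    'y': np.random.rand(8, 1).astype('float32')},
              fetch_list=[avg_cost])      # evaluate the ops, fetch avg_cost
```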
......
......@@ -23,6 +23,12 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
$ docker pull paddlepaddle/paddle:latest-dev-android
```
For users in China, we provide a mirror for faster access:
```bash
$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
```
### Building the PaddlePaddle C-API Library
Once the development image is built, it can be used to compile the Android version of the PaddlePaddle C-API library.
The Android Docker development image exposes two configurable parameters:
......@@ -56,15 +62,15 @@ The Android Docker development image exposes two configurable parameters:
- Build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- Build the PaddlePaddle library for `arm64-v8a` and `Android API 21`
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
When the `docker run` command above is executed, the container runs the [paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) script by default. This script records the CMake configurations commonly used for cross-compiling the Android version of the PaddlePaddle library, and automatically builds the standalone toolchain, compiles, and installs according to `ANDROID_ABI` and `ANDROID_API`. Since the arm64 architecture requires an Android API no lower than 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21` the Docker container falls back to the `Android API 21` toolchain. Users can refer to the [configure cross-compilation parameters](#配置交叉编译参数) section below to customize the script the container executes. After compilation and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
......@@ -155,7 +161,11 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
Users can also set other build parameters as needed. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set it to `Release`. You can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS` manually.
Users can also set other build parameters as needed.
- Set `CMAKE_BUILD_TYPE` to `MinSizeRel` to minimize the size of the generated library.
- Set `CMAKE_BUILD_TYPE` to `Release` to obtain the fastest execution speed.
- Users can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS` manually.
**Performance TIPS.** To achieve the fastest computation speed, we suggest the following CMake settings:
......
......@@ -25,6 +25,12 @@ Users can directly use the published Docker image.
$ docker pull paddlepaddle/paddle:latest-dev-android
```
For users in China, we provide a faster mirror.
```bash
$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
......@@ -86,19 +92,19 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
......
......@@ -56,7 +56,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
shape_inference data_transform lod_tensor profiler)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
......@@ -68,9 +68,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op)
......@@ -80,7 +80,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler feed_fetch_method)
framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
......@@ -58,13 +57,13 @@ static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::CHANNEL) {
var->GetMutable<ChannelHolder>();
} else if (var_type == proto::VarType::NCCL_COM) {
// GetMutable will be called in ncclInit
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, NCCL_COM]",
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
......@@ -126,9 +125,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
......
......@@ -113,7 +113,10 @@ message VarType {
PLACE_LIST = 14;
READER = 15;
CHANNEL = 16;
NCCL_COM = 17;
// Any type whose concrete class is decided only at run time is RAW;
// RAW variables manage their own allocations inside operators
// such as nccl_op
RAW = 17;
}
required Type type = 1;
......@@ -164,4 +167,6 @@ message BlockDesc {
// Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// for more details.
// TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; }
......@@ -31,8 +31,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
os << "{";
for (auto &v : lod) {
os << "{";
bool is_first = true;
for (auto &i : v) {
os << i << ",";
if (is_first) {
os << i;
is_first = false;
} else {
os << ", " << i;
}
}
os << "}";
}
......
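For reference, the separator-before-all-but-the-first pattern the fixed loop implements is, in Python terms, just a join over one LoD level:
```python
# `level` stands for one level of the LoD (a list of offsets).
level = [0, 4, 10]
print("{" + ", ".join(str(i) for i in level) + "}")  # prints {0, 4, 10}
```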
......@@ -125,6 +125,8 @@ class OpDesc {
BlockDesc *Block() { return this->block_; }
const BlockDesc &BlockRef() const { return *this->block_; }
void SetBlock(BlockDesc *block) { this->block_ = block; }
private:
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
......@@ -497,7 +498,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(place);
// Profiling: record the cost of this operator run under its type name.
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
......
......@@ -32,23 +32,11 @@ void ReadBinaryFile(const std::string& filename, std::string& contents) {
inputfs.close();
}
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) {
if (var->Persistable()) {
// There are many unreachable variables in the program
for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) {
if (op->Type() == framework::kFeedOpType) {
continue;
}
for (auto input_argument_name : op->InputArgumentNames()) {
if (input_argument_name == var->Name()) {
return true;
}
}
}
}
bool IsPersistable(const framework::VarDesc* var) {
if (var->Persistable() &&
var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
var->GetType() != framework::proto::VarType::FETCH_LIST) {
return true;
}
return false;
}
......@@ -65,8 +53,8 @@ void LoadPersistables(framework::Executor& executor,
std::vector<std::string> paramlist;
for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name();
if (IsPersistable(var)) {
VLOG(3) << "persistable variable's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->GetShape());
......@@ -101,7 +89,6 @@ void LoadPersistables(framework::Executor& executor,
executor.Run(*load_program, &scope, 0, true, true);
VLOG(3) << "Ran loading successfully";
delete load_program;
}
......
......@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
......@@ -30,5 +30,5 @@ inference_test(label_semantic_roles)
inference_test(recognize_digits ARGS mlp conv)
inference_test(recommender_system)
#inference_test(rnn_encoder_decoder)
inference_test(understand_sentiment)
inference_test(understand_sentiment ARGS conv)
inference_test(word2vec)
......@@ -32,16 +32,42 @@ TEST(inference, label_semantic_roles) {
paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
ctx_p2, mark;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(
predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
int64_t word_dict_len = 44068;
int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2;
SetupLoDTensor(word,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(predicate,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(predicate_dict_len - 1));
SetupLoDTensor(ctx_n2,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_n1,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_0,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p1,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p2,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(mark,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(mark_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word);
......
......@@ -31,7 +31,12 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoDTensor words;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
int64_t word_dict_len = 5147;
SetupLoDTensor(words,
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&words);
......
......@@ -31,12 +31,12 @@ TEST(inference, word2vec) {
paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2072; // Hard-coding the size of dictionary
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size);
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word);
......
......@@ -101,8 +101,8 @@ void TestInference(const std::string& dirname,
if (IsCombined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// Users are free to specify different filename
// (provided: the filenames are changed in the python api as well: io.py)
// The file names should be consistent with those used in the Python API
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(executor,
......
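For context, a sketch of saving an inference model from Python with the era's basic positional signature; `exe`, `main_program` and `y_predict` are assumed to come from a normal fluid training script, and how the combined single-file variant was selected depends on the API version (the test itself hard-codes the names `__model_combined__` and `__params_combined__`):
```python
# Sketch only: save an inference model for the C++ test above to load.
import paddle.fluid as fluid

fluid.io.save_inference_model("model_dir",  # output directory
                              ["x"],        # names of the feed variables
                              [y_predict],  # fetch targets
                              exe)          # executor that owns the scope
```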
......@@ -11,6 +11,8 @@ function(op_library TARGET)
set(cc_srcs)
set(cu_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
......@@ -30,10 +32,16 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
......@@ -54,7 +62,7 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
......@@ -98,6 +106,12 @@ function(op_library TARGET)
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
......@@ -141,6 +155,7 @@ op_library(print_op DEPS lod_tensor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
......@@ -152,43 +167,17 @@ op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(create_reader_op DEPS reader)
# Register multiple kernels with pybind
if (WITH_GPU)
op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
vol2col depthwise_conv)
op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
conv_transpose_cudnn_op.cu.cc DEPS vol2col)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n")
op_library(conv_op DEPS vol2col depthwise_conv)
else()
op_library(conv_op SRCS conv_op.cc DEPS vol2col)
op_library(pool_op SRCS pool_op.cc DEPS pooling)
op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col)
op_library(conv_op DEPS vol2col)
endif()
cc_library(batch_size_like SRCS batch_size_like.cc DEPS op_registry)
op_library(fill_constant_batch_size_like_op
SRCS fill_constant_batch_size_like_op.cc fill_constant_batch_size_like_op.cu.cc
DEPS batch_size_like)
op_library(uniform_random_batch_size_like_op
SRCS uniform_random_batch_size_like_op.cc
DEPS batch_size_like uniform_random_op)
op_library(gaussian_random_batch_size_like_op
SRCS gaussian_random_batch_size_like_op.cc
DEPS batch_size_like gaussian_random_op)
op_library(conv_transpose_op DEPS vol2col)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_size_like.h"
namespace paddle {
namespace operators {
void BatchSizeLikeOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of %s should not be null.", Type());
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of %s should not be null.",
Type());
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
PADDLE_ENFORCE_GT(shape.size(), 0);
std::vector<int64_t> shape_int64(shape.size(), 0);
std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); });
auto output_dim = framework::make_ddim(shape_int64);
int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
PADDLE_ENFORCE_GE(input_dim_idx, 0);
PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
PADDLE_ENFORCE_GE(output_dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
ctx->SetOutputDim("Out", output_dim);
}
BatchSizeLikeOpMaker::BatchSizeLikeOpMaker(OpProto *proto,
OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(Tensor) Tensor "
"whose input_dim_idx'th dimension specifies the batch_size");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
AddAttr<int>("input_dim_idx",
"(int, default 0) The index of input's batch size dimension")
.SetDefault(0);
AddAttr<int>("output_dim_idx",
"(int, default 0) The index of output's batch size dimension")
.SetDefault(0);
}
} // namespace operators
} // namespace paddle
......@@ -24,12 +24,50 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of %s should not be null.", Type());
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of %s should not be null.", Type());
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
PADDLE_ENFORCE_GT(shape.size(), 0);
std::vector<int64_t> shape_int64(shape.size(), 0);
std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); });
auto output_dim = framework::make_ddim(shape_int64);
int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
PADDLE_ENFORCE_GE(input_dim_idx, 0);
PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
PADDLE_ENFORCE_GE(output_dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
ctx->SetOutputDim("Out", output_dim);
}
};
class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker);
BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(Tensor) Tensor "
"whose input_dim_idx'th dimension specifies the batch_size");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
AddAttr<int>("input_dim_idx",
"(int, default 0) The index of input's batch size dimension")
.SetDefault(0);
AddAttr<int>("output_dim_idx",
"(int, default 0) The index of output's batch size dimension")
.SetDefault(0);
}
};
} // namespace operators
......
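A plain-Python restatement of the shape rule in `InferShape` above, as a quick sanity check rather than Paddle code:
```python
def batch_size_like_shape(input_shape, shape, input_dim_idx=0, output_dim_idx=0):
    # Start from the attribute `shape`, then overwrite the output's
    # batch dimension with the input's batch dimension.
    out = list(shape)
    out[output_dim_idx] = input_shape[input_dim_idx]
    return out

assert batch_size_like_shape([32, 784], [-1, 10]) == [32, 10]
```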
......@@ -94,6 +94,38 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
}
}
void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist,
T overlap_threshold) const {
constexpr T kEPS = static_cast<T>(1e-6);
int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1];
auto* dist_data = dist.data<T>();
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
// the j-th column has been matched to one entity.
continue;
}
int max_row_idx = -1;
T max_dist = -1;
for (int i = 0; i < row; ++i) {
T dist = dist_data[i * col + j];
if (dist < kEPS) {
// distance is ~0 between the i-th row and the j-th column
continue;
}
if (dist >= overlap_threshold && dist > max_dist) {
max_row_idx = i;
max_dist = dist;
}
}
if (max_row_idx != -1) {
PADDLE_ENFORCE_EQ(match_indices[j], -1);
match_indices[j] = max_row_idx;
match_dist[j] = max_dist;
}
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
......@@ -120,13 +152,21 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
int* indices = match_indices->data<int>();
T* dist = match_dist->data<T>();
auto type = context.Attr<std::string>("match_type");
auto threshold = context.Attr<float>("dist_threshold");
if (n == 1) {
BipartiteMatch(*dist_mat, indices, dist);
if (type == "per_prediction") {
ArgMaxMatch(*dist_mat, indices, dist, threshold);
}
} else {
auto lod = dist_mat->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) {
Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
BipartiteMatch(one_ins, indices + i * col, dist + i * col);
if (type == "per_prediction") {
ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
}
}
}
}
......@@ -147,6 +187,19 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
"This tensor can contain LoD information to represent a batch of "
"inputs. One instance of this batch can contain different numbers of "
"entities.");
AddAttr<std::string>(
"match_type",
"(string, defalut: per_prediction) "
"The type of matching method, should be 'bipartite' or "
"'per_prediction', 'bipartite' by defalut.")
.SetDefault("bipartite")
.InEnum({"bipartite", "per_prediction"});
AddAttr<float>(
"dist_threshold",
"(float, defalut: 0.5) "
"If `match_type` is 'per_prediction', this threshold is to determine "
"the extra matching bboxes based on the maximum distance.")
.SetDefault(0.5);
AddOutput("ColToRowMatchIndices",
"(Tensor) A 2-D Tensor with shape [N, M] in int type. "
"N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
......@@ -168,10 +221,10 @@ distance matrix. For input 2D matrix, the bipartite matching algorithm can
find the matched column for each row, also can find the matched row for
each column. And this operator only calculate matched indices from column
to row. For each instance, the number of matched indices is the number of
of columns of the input ditance matrix.
columns of the input distance matrix.
There are two outputs to save matched indices and distance.
A simple description, this algothrim matched the best (maximum distance)
Put simply, this algorithm matches the best (maximum-distance)
row entity to each column entity, and the matched indices are not duplicated
in each row of ColToRowMatchIndices. If a column entity is not matched to
any row entity, -1 is set in ColToRowMatchIndices.
......
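A plain-Python sketch (not Paddle code) of the `per_prediction` step added above: after bipartite matching, every still-unmatched column is matched to the row with the largest distance, provided that distance reaches `overlap_threshold`:
```python
def argmax_match(dist, match_indices, match_dist, overlap_threshold, eps=1e-6):
    rows, cols = len(dist), len(dist[0])
    for j in range(cols):
        if match_indices[j] != -1:       # column already matched
            continue
        max_row, max_d = -1, -1.0
        for i in range(rows):
            d = dist[i][j]
            if d < eps:                  # treat ~0 distance as no overlap
                continue
            if d >= overlap_threshold and d > max_d:
                max_row, max_d = i, d
        if max_row != -1:
            match_indices[j] = max_row
            match_dist[j] = max_d
```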
......@@ -83,7 +83,7 @@ class CompareOp : public framework::OperatorWithKernel {
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_OP(op_type, _equation) \
#define REGISTER_COMPARE_OP(op_type, _equation) \
struct _##op_type##Comment { \
static char type[]; \
static char equation[]; \
......@@ -96,11 +96,17 @@ class CompareOp : public framework::OperatorWithKernel {
::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_OP(equal, "Out = X == Y");
REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
REGISTER_LOGICAL_OP(not_equal, "Out = X != Y");
REGISTER_LOGICAL_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
REGISTER_COMPARE_OP(less_than, "Out = X < Y");
REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_COMPARE_OP(less_equal, "Out = X <= Y");
REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
REGISTER_COMPARE_OP(greater_than, "Out = X > Y");
REGISTER_COMPARE_KERNEL(greater_than, CPU,
paddle::operators::GreaterThanFunctor);
REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y");
REGISTER_COMPARE_KERNEL(greater_equal, CPU,
paddle::operators::GreaterEqualFunctor);
REGISTER_COMPARE_OP(equal, "Out = X == Y");
REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
REGISTER_COMPARE_OP(not_equal, "Out = X != Y");
REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
......@@ -14,7 +14,11 @@ limitations under the License. */
#include "paddle/fluid/operators/compare_op.h"
REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
REGISTER_LOGICAL_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor);
REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
REGISTER_COMPARE_KERNEL(greater_than, CUDA,
paddle::operators::GreaterThanFunctor);
REGISTER_COMPARE_KERNEL(greater_equal, CUDA,
paddle::operators::GreaterEqualFunctor);
REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor);
......@@ -34,6 +34,18 @@ struct LessEqualFunctor {
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
};
template <typename T>
struct GreaterThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
};
template <typename T>
struct GreaterEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
};
template <typename T>
struct EqualFunctor {
using ELEM_TYPE = T;
......@@ -76,7 +88,7 @@ class CompareOpKernel
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \
#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, ::paddle::operators::CompareOpKernel< \
::paddle::platform::dev##DeviceContext, functor<int>>, \
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h"
......@@ -34,12 +35,46 @@ class ConcatKernel : public framework::OpKernel<T> {
auto out_stride = framework::stride_numel(out->dims());
size_t output_offset = 0;
for (auto* in : ins) {
auto in_stride = framework::stride_numel(in->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
out->data<T>() + output_offset, out_stride,
in->data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
// If axis >= 1, copying straight into the GPU output would issue many
// small cuda memcpys. Instead, copy the inputs to the CPU, do the strided
// copy there, then copy the assembled result back to the GPU output.
if (platform::is_gpu_place(place) && axis >= 1) {
platform::CPUPlace copy_place;
auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
framework::Tensor cpu_out;
cpu_out.Resize(out->dims());
cpu_out.mutable_data<T>(copy_place);
auto& dev_ctx = ctx.device_context();
std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
for (auto* in : ins) {
std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
cpu_ins.emplace_back(std::move(cpu_in));
}
// TODO(dzhwinter): overlap copy and compute stream
// https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
dev_ctx.Wait();
for (auto& in : cpu_ins) {
auto& cpu_in = *in.get();
auto in_stride = framework::stride_numel(cpu_in.dims());
StridedNumelCopyWithAxis<T>(
cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
cpu_in.data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
framework::TensorCopy(cpu_out, place, dev_ctx, out);
} else {
for (auto* in : ins) {
auto in_stride = framework::stride_numel(in->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
out->data<T>() + output_offset, out_stride,
in->data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
}
}
};
......
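For reference, the kernel's output is ordinary concatenation along `axis`; the new GPU branch merely stages it through host memory when `axis >= 1` to avoid issuing one small device memcpy per row chunk:
```python
import numpy as np

a, b = np.ones((2, 3)), np.zeros((2, 2))
out = np.concatenate([a, b], axis=1)  # shape (2, 5)
```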
......@@ -54,12 +54,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
(dilations[i] * (filter_dims[i + 2] - 1) + 1) >
0,
"Due to the settings of paddings, filter_dims and "
"dilations, the output size is less than 0, please check "
"again.");
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
......
......@@ -31,7 +31,14 @@ using Tensor = framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
PADDLE_ENFORCE(
output_size > 0,
"Due to the settings of padding(%d), filter_size(%d), dilation(%d) and "
"stride(%d), the output size is less than 0, please check "
"again. Input_size:%d",
padding, filter_size, dilation, stride, input_size);
return output_size;
}
inline bool IsExpand(std::vector<int64_t>& filter_dim,
......
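A quick numeric check of `ConvOutputSize` above: with input 7, filter 3, dilation 2, padding 0 and stride 1, `dkernel = 2*(3-1)+1 = 5` and the output is `(7 - 5)/1 + 1 = 3`:
```python
def conv_output_size(input_size, filter_size, dilation, padding, stride):
    dkernel = dilation * (filter_size - 1) + 1
    return (input_size + 2 * padding - dkernel) // stride + 1

assert conv_output_size(7, 3, 2, 0, 1) == 3
```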
......@@ -177,8 +177,8 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
auto ch = std::shared_ptr<grpc::Channel>(
grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args));
auto ch =
grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
channels_[ep] = ch;
return ch;
......
......@@ -129,6 +129,8 @@ class ListenAndServOp : public framework::OperatorBase {
}
if (exit_flag) {
rpc_service_->ShutDown();
rpc_service_->SetCond(1);
break;
}
try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
......
......@@ -65,7 +65,7 @@ class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
framework::BlockDesc *block) const override {
auto out_var_name = op_desc.Output("Communicator").front();
auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::NCCL_COM;
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
......
......@@ -19,6 +19,11 @@ namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
PADDLE_ENFORCE(output_size > 0,
"Due to the settings of padding(%d), filter_size(%d) and "
"stride(%d), the output size is less than 0, please check "
"again. Input_size:%d",
padding, filter_size, stride, input_size);
return output_size;
}
......
......@@ -121,10 +121,15 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
ops::ReshapeGradOp);
REGISTER_OP_CPU_KERNEL(reshape,
ops::ReshapeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
reshape_grad, ops::ReshapeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
ops::ReshapeKernel<CPU, double>,
ops::ReshapeKernel<CPU, int>,
ops::ReshapeKernel<CPU, int64_t>);
REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel<CPU, float>,
ops::ReshapeGradKernel<CPU, double>,
ops::ReshapeGradKernel<CPU, int>,
ops::ReshapeGradKernel<CPU, int64_t>);
......@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reshape_op.h"
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(
reshape,
paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
REGISTER_OP_CUDA_KERNEL(
reshape_grad,
paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel<CUDA, float>,
paddle::operators::ReshapeKernel<CUDA, double>,
paddle::operators::ReshapeKernel<CUDA, int>,
paddle::operators::ReshapeKernel<CUDA, int64_t>);
REGISTER_OP_CUDA_KERNEL(reshape_grad,
paddle::operators::ReshapeGradKernel<CUDA, float>,
paddle::operators::ReshapeGradKernel<CUDA, double>,
paddle::operators::ReshapeGradKernel<CUDA, int>,
paddle::operators::ReshapeGradKernel<CUDA, int64_t>);
......@@ -121,9 +121,27 @@ This operator will send tensor to recv_op at the parameter server.
}
};
class SendOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
ops::SendOpMaker, ops::SendOpVarTypeInference,
ops::SendOpShapeInference);
......@@ -95,7 +95,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
for (auto kv : outputs) {
for (auto v : kv.second) {
auto var = block->Var(v);
var->SetDataType(f::proto::DataType::FP32);
var->SetDataType(f::proto::VarType::FP32);
}
}
......@@ -122,33 +122,37 @@ void StartServerNet(bool is_sparse) {
// sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::BlockDesc *optimize_block = program.MutableBlock(0);
// X holds the server-side tensors and RX the received tensors; they must have the same shape.
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
f::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
attrs.insert({"Fanin", 1});
attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", block});
attrs.insert({"OptimizeBlock", optimize_block});
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
listen_and_serv_op->Run(scope, place);
}
TEST(SendRecvOp, CPUDense) {
std::thread server_thread(StartServerNet, false);
sleep(10); // wait server to start
sleep(5); // wait server to start
// local net
f::Scope scope;
p::CPUPlace place;
InitTensorsInScope(scope, place);
// create rpc client var
scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
{{"Out", {"Out"}}}, attrs);
auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
send_op->Run(scope, place);
auto in_var = scope.Var("x1");
......@@ -175,11 +179,13 @@ TEST(SendRecvOp, CPUSparse) {
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
InitSelectedRowsInScope(scope, place);
scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
{{"Out", {"Out"}}}, attrs);
auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
send_op->Run(scope, place);
auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
......
proto_library(profiler_proto SRCS profiler.proto)
if(WITH_GPU)
cc_library(enforce SRCS enforce.cc DEPS)
else()
......@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(profiler SRCS profiler.cc DEPS device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <map>
#include <mutex>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace platform {
namespace {
thread_local const char *cur_annotation = nullptr;
std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
} // namespace
#ifdef PADDLE_WITH_CUPTI
namespace {
// TODO(panyx0718): Revisit the buffer size here.
uint64_t kBufSize = 32 * 1024;
uint64_t kAlignSize = 8;
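// Round `buffer` up to the next multiple of `align` (no-op if already aligned).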
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \
? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
: (buffer))
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
dynload::cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(-1); \
} \
} while (0)
void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
}
void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
}
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) {
uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize);
*size = kBufSize;
*buffer = ALIGN_BUFFER(buf, kAlignSize);
*maxNumRecords = 0;
}
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) {
CUptiResult status;
CUpti_Activity *record = NULL;
if (validSize > 0) {
do {
status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
if (status == CUPTI_SUCCESS) {
switch (record->kind) {
case CUPTI_ACTIVITY_KIND_KERNEL:
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
auto *kernel =
reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
tracer->AddKernelRecords(kernel->start, kernel->end,
kernel->deviceId, kernel->streamId,
kernel->correlationId);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
// Not an error: the buffer simply has no more records.
break;
} else {
CUPTI_CALL(status);
}
} while (1);
size_t dropped;
CUPTI_CALL(
dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) {
fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
}
}
free(buffer);
}
} // namespace
class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}
void AddAnnotation(uint64_t id, const std::string &anno) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_[id] = anno;
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id});
}
bool IsEnabled() {
std::lock_guard<std::mutex> l(trace_mu_);
return enabled_;
}
void Enable() {
std::lock_guard<std::mutex> l(trace_mu_);
if (enabled_) {
fprintf(stderr, "DeviceTracer already enabled\n");
return;
}
EnableActivity();
// Register callbacks for buffer requests and completed by CUPTI.
CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
bufferCompleted));
CUptiResult ret;
ret = dynload::cuptiSubscribe(
&subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
fprintf(stderr, "CUPTI subcriber limit reached.\n");
} else if (ret != CUPTI_SUCCESS) {
fprintf(stderr, "Failed to create CUPTI subscriber.\n");
}
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
proto::Profile GenProfile() {
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
continue;
}
auto *event = profile_pb.add_events();
event->set_name(correlations_.at(r.correlation_id));
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}
for (const auto &et : event_times) {
fprintf(
stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
et.first.c_str(),
std::accumulate(et.second.begin(), et.second.end(), uint64_t{0}) / 1000000.0,
et.second.size());
}
return profile_pb;
}
void Disable() {
// The flush might cause additional calls into DeviceTracer.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
std::lock_guard<std::mutex> l(trace_mu_);
DisableActivity();
dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
CUPTI_CALL(dynload::cuptiFinalize());
enabled_ = false;
}
private:
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno =
cur_annotation ? cur_annotation : cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
}
}
std::mutex trace_mu_;
bool enabled_;
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
#endif // PADDLE_WITH_CUPTI
class DeviceTracerDummy : public DeviceTracer {
public:
DeviceTracerDummy() {}
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
bool IsEnabled() { return false; }
void Enable() {}
proto::Profile GenProfile() { return proto::Profile(); }
void Disable() {}
};
void CreateTracer(DeviceTracer **t) {
#ifdef PADDLE_WITH_CUPTI
*t = new DeviceTracerImpl();
#else
*t = new DeviceTracerDummy();
#endif // PADDLE_WITH_CUPTI
}
DeviceTracer *GetDeviceTracer() {
std::call_once(tracer_once_flag, CreateTracer, &tracer);
return tracer;
}
void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; }
} // namespace platform
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle {
namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
class DeviceTracer {
public:
struct KernelRecord {
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
virtual void Enable() = 0;
// Needs to be called once after use.
virtual void Disable() = 0;
// Add a pair to correlate an internal CUDA id with a high-level
// annotation (string), so that CUDA statistics can be presented as
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
// Add CUDA kernel stats. `correlation_id` will be mapped to the
// annotation added earlier, for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id) = 0;
// Generate a proto after profiling is done (i.e., after Disable()).
virtual proto::Profile GenProfile() = 0;
virtual bool IsEnabled() = 0;
};
// Get a DeviceTracer.
DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
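// A minimal usage sketch (assuming a CUPTI-enabled build; the annotation
// string and the kernel launch are placeholders):
//
//   DeviceTracer* tracer = GetDeviceTracer();
//   tracer->Enable();
//   SetCurAnnotation("elementwise_add");
//   // ... launch a CUDA kernel; CUPTI correlates it with the annotation ...
//   ClearCurAnnotation();
//   tracer->Disable();
//   proto::Profile profile = tracer->GenProfile();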
} // namespace platform
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
DEPS dynamic_loader)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
 * The following macro definition generates a struct
 * (one per function) that dynamically loads the cupti routine
 * via operator overloading.
 *
 * Note: by default the routines are resolved from dynamically linked libs.
 */
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \
std::call_once(cupti_dso_flag, \
paddle::platform::dynload::GetCUPTIDsoHandle, \
&cupti_dso_handle); \
void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
return __name(args...); \
} \
}; \
extern DynLoad__##__name __name
#endif
#define CUPTI_ROUTINE_EACH(__macro) \
__macro(cuptiActivityEnable); \
__macro(cuptiActivityDisable); \
__macro(cuptiActivityRegisterCallbacks); \
__macro(cuptiActivityGetAttribute); \
__macro(cuptiActivitySetAttribute); \
__macro(cuptiGetTimestamp); \
__macro(cuptiActivityGetNextRecord); \
__macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
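// For illustration, DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(cuptiGetTimestamp)
// expands (in the PADDLE_USE_DSO branch) to roughly:
//
//   struct DynLoad__cuptiGetTimestamp {
//     template <typename... Args>
//     inline CUptiResult CUPTIAPI operator()(Args... args) {
//       typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);
//       std::call_once(cupti_dso_flag, GetCUPTIDsoHandle, &cupti_dso_handle);
//       void *p = dlsym(cupti_dso_handle, "cuptiGetTimestamp");
//       return reinterpret_cast<cuptiFunc>(p)(args...);
//     }
//   };
//   extern DynLoad__cuptiGetTimestamp cuptiGetTimestamp;
//
// so a call like dynload::cuptiGetTimestamp(&ts) resolves the symbol lazily
// on first use.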
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUPTI
......@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
namespace paddle {
namespace platform {
namespace dynload {
static const char* cupti_lib_path = CUPTI_LIB_PATH;
static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
......@@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) {
#endif
}
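// Load libcupti, preferring FLAGS_cupti_dir when set and falling back to the
// compile-time CUPTI_LIB_PATH; the trailing `false` appears to make a missing
// CUPTI library non-fatal.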
void GetCUPTIDsoHandle(void** dso_handle) {
std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false);
#else
GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
......
......@@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle);
*/
void GetCUDNNDsoHandle(void** dso_handle);
void GetCUPTIDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CURAND
*
......
......@@ -129,9 +129,6 @@ TEST(NCCL, all_reduce) {
} // namespace paddle
int main(int argc, char** argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
......
......@@ -15,7 +15,13 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace platform {
......@@ -132,10 +138,13 @@ RecordEvent::RecordEvent(const std::string& name,
dev_ctx_ = dev_ctx;
name_ = name;
PushEvent(name_, dev_ctx_);
// Annotations may eventually need the same push/pop nesting as events.
SetCurAnnotation(name_.c_str());
}
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}
......@@ -147,7 +156,14 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ",
"EnableProfiler.");
g_state = state;
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
if (g_state == ProfilerState::kCUDA) {
g_profiler_place = "CUDA";
} else if (g_state == ProfilerState::kCPU) {
g_profiler_place = "CPU";
} else {
g_profiler_place = "All";
GetDeviceTracer()->Enable();
}
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead.
......@@ -190,6 +206,12 @@ void DisableProfiler(EventSortingKey sorted_key) {
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile();
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
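// A minimal sketch of the combined mode (the call site is hypothetical and
// the sorting key shown is an assumption):
//
//   EnableProfiler(ProfilerState::kAll);  // CPU events plus DeviceTracer
//   // ... run operators ...
//   DisableProfiler(EventSortingKey::kDefault);  // also emits GenProfile()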
......@@ -254,9 +276,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
if (rit != pushed_events.rend()) {
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
double event_time =
(g_profiler_place == "CUDA" || g_profiler_place == "All")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle {
namespace platform {
......@@ -93,6 +94,7 @@ enum ProfilerState {
kDisabled, // disabled state
kCPU, // CPU profiling state
kCUDA, // GPU profiling state
kAll, // Profile both CPU and GPU. (Currently experimental).
};
void Mark(const std::string& name, const DeviceContext* dev_ctx);
......@@ -102,7 +104,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent {
explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent();
......@@ -110,9 +112,12 @@ struct RecordEvent {
const DeviceContext* dev_ctx_;
// Event name
std::string name_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
std::string full_name_;
};
// Return the event list of all threads. Asummed the returned value calls
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
optional uint32 stream_id = 6;
}
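// For illustration, a single captured event in proto text format might read
// (values are made up):
//
//   events {
//     name: "elementwise_add"
//     start_ns: 1200
//     end_ns: 3400
//     device_id: 0
//     stream_id: 7
//   }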
message Profile {
repeated Event events = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
}
\ No newline at end of file
......@@ -252,7 +252,7 @@ void BindVarDsec(py::module &m) {
.value("CHANNEL", proto::VarType::CHANNEL)
.value("PLACE_LIST", proto::VarType::PLACE_LIST)
.value("READER", proto::VarType::READER)
.value("NCCL_COM", proto::VarType::NCCL_COM);
.value("RAW", proto::VarType::RAW);
}
void BindOpDesc(py::module &m) {
......
......@@ -49,11 +49,6 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
namespace paddle {
namespace pybind {
static size_t UniqueIntegerGenerator(const std::string &prefix) {
static std::unordered_map<std::string, std::atomic<size_t>> generators;
return generators[prefix].fetch_add(1);
}
bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA
return false;
......@@ -410,7 +405,6 @@ All parameter, weight, gradient are variables in Paddle.
(void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
Executor::Run);
m.def("unique_integer", UniqueIntegerGenerator);
m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices);
......@@ -465,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle.
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
.value("kCUDA", platform::ProfilerState::kCUDA)
.value("kAll", platform::ProfilerState::kAll)
.export_values();
py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
......
......@@ -58,7 +58,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
| `WITH_C_API` | OFF | Build capi libraries for inference. |
| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
......
......@@ -40,7 +40,7 @@ function cmake_gen() {
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
......@@ -49,6 +49,7 @@ function cmake_gen() {
-DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
========================================
EOF
......@@ -64,7 +65,7 @@ EOF
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
......@@ -72,6 +73,7 @@ EOF
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
......@@ -171,7 +173,7 @@ EOF
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl-dev &&"
NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
else
NCCL_DEPS=""
fi
......
......@@ -28,10 +28,9 @@ int main(int argc, char** argv) {
}
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
"warpctc_dir"));
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
#endif
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
......
......@@ -3,12 +3,14 @@ file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
set(PY_FILES paddle/__init__.py
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
${UTILS_PY_FILES}
${V2_PY_FILES})
${V2_PY_FILES}
${FLUID_PY_FILES})
add_custom_target(copy_paddle_master)
......@@ -43,10 +45,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
......@@ -72,7 +74,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/fluid/tests)
add_subdirectory(paddle/fluid/tests)
endif()
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
......
......@@ -39,6 +39,7 @@ from concurrency import (Go, make_channel, channel_send, channel_recv,
import clip
from memory_optimization_transpiler import memory_optimize
import profiler
import unique_name
Tensor = LoDTensor
......@@ -63,6 +64,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'DistributeTranspiler',
'memory_optimize',
'profiler',
'unique_name',
]
......
(Six more file diffs were collapsed in the original view and are not shown.)