diff --git a/.gitignore b/.gitignore index fe0d13f4d9eab2c2a8e7001c9ecb69cce1333af1..2badc3bdaa52f2608183fa34393719be66630654 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,6 @@ third_party/ cmake-build-* # generated while compiling -python/paddle/v2/fluid/core.so paddle/pybind/pybind.h CMakeFiles cmake_install.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 5db5c228be2d6491463ec1ddb17de7bec730bd44..a2f440c2d089b5d596ab59d5099c0066ef325614 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) -project(paddle CXX C Go) +project(paddle CXX C) message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " @@ -60,7 +60,7 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" ON) +option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -146,6 +146,7 @@ include(external/cares) include(external/grpc) include(cudnn) # set cudnn libraries, must come before configure +include(cupti) include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages @@ -174,7 +175,7 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - include(cuda) + include(cuda) endif(WITH_GPU) if(WITH_MKLML) @@ -201,17 +202,18 @@ endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # placed after this block, because they depend on it.
if(WITH_GOLANG) + enable_language(Go) add_subdirectory(go) endif(WITH_GOLANG) set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") -SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") add_subdirectory(paddle) if(WITH_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() if(WITH_DOC) diff --git a/Dockerfile b/Dockerfile index 6ac9901ac6cea12e97047efdfb6272c957f166ae..60e76c7f2ede6beaca11659020d5991a75d5b741 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,8 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison libnccl-dev \ + git python-pip python-dev openssh-server bison \ + libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ diff --git a/Dockerfile.android b/Dockerfile.android index 9d13a414f67be04e17b7d83403228d92bce0eda9..cc022d596b4b74dd1e4f4d0901dd81c91a7decd1 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -21,16 +21,6 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin - # git credential to skip password typing RUN git config --global credential.helper store diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile index 98356cd7613baff7f0cd66d1462068232b2b8500..13ad8e1b6237e6f41a076c4fb54311728832ae33 100644 --- a/benchmark/cluster/vgg16/Dockerfile +++ b/benchmark/cluster/vgg16/Dockerfile @@ -1,18 +1,35 @@ -#FROM python:2.7.14 FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 -RUN apt-get update && apt-get install -y python -RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev -# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, -# so we must build one with distribute support to install in this image. + +# you can get a mirror list here: +# https://launchpad.net/ubuntu/+archivemirrors +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev +RUN pip install -U kubernetes opencv-python + RUN pip install paddlepaddle +# if the network is slow, you may need to add a proxy here. +# ENV https_proxy= RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' RUN pip uninstall -y paddlepaddle +# unset the proxy if it was set. +# ENV https_proxy="" + +# NOTE: By default, CI-built wheel packages turn WITH_DISTRIBUTE=OFF, +# so we must build one with distribute support to install in this image.
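+# (A sketch of one way to produce such a wheel, assuming an out-of-source
+# build directory; the exact commands are an illustration, not part of this
+# image: `cmake .. -DWITH_DISTRIBUTE=ON && make -j`, then copy
+# build/python/dist/*.whl next to this Dockerfile.)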
+ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl +ENV LD_LIBRARY_PATH=/usr/local/lib + +# tf k8s +RUN pip install tensorflow==1.4.0 +ADD tf_k8s /usr/bin +RUN chmod +x /usr/bin/tf_k8s +ADD vgg16_tf.py /workspace/ # below lines may change a lot for debugging ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root -ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl && \ -chmod +x /usr/bin/paddle_k8s -ENV LD_LIBRARY_PATH=/usr/local/lib +RUN chmod +x /usr/bin/paddle_k8s ADD vgg16_fluid.py vgg16_v2.py /workspace/ diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml index 0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802..3d56caac009464d1073423bb63abff1f8b0cf28f 100644 --- a/benchmark/cluster/vgg16/fluid_trainer.yaml +++ b/benchmark/cluster/vgg16/fluid_trainer.yaml @@ -11,7 +11,7 @@ spec: paddle-job: vgg16job spec: imagePullSecrets: - - name: job-registry-secret + - name: job-registry-secret hostNetwork: true containers: - name: trainer diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s new file mode 100644 index 0000000000000000000000000000000000000000..4fc263d5f681aeabfa71f1758714d269d987b272 --- /dev/null +++ b/benchmark/cluster/vgg16/tf_k8s @@ -0,0 +1,82 @@ +#!/bin/bash +check_trainer_ret() { + ret=$1 + stdbuf -oL echo "job returned $ret...setting pod return message..." + stdbuf -oL echo "===============================" + + if [ $ret -eq 136 ] ; then + echo "Arithmetic Operation Error (Floating Point Exception)" > /dev/termination-log + elif [ $ret -eq 139 ] ; then + echo "Segmentation Fault" > /dev/termination-log + elif [ $ret -eq 1 ] ; then + echo "General Error" > /dev/termination-log + elif [ $ret -eq 134 ] ; then + echo "Program Abort" > /dev/termination-log + fi + stdbuf -oL echo "termination log written..." + exit $ret +} + +g_pservers="" +g_trainers="" + +wait_running_pods(){ + pserver_label="tf-job-pserver=${JOB_NAME}" + trainer_label="tf-job-trainer=${JOB_NAME}" + + stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM} + stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM} + + g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT}) + g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT}) +} + +start_tf_pserver(){ + wait_running_pods + + label="tf-job-pserver=${JOB_NAME}" + pserver_id=$(python /root/k8s_tools.py fetch_id ${label}) + + cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ + --job_name=${TF_JOB_NAME} --task_index=${pserver_id}" + + stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" +} + +start_tf_trainer(){ + wait_running_pods + + label="tf-job-trainer=${JOB_NAME}" + trainer_id=$(python /root/k8s_tools.py fetch_id ${label}) + + cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ + --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}" + + stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" + check_trainer_ret $?
+} + +start_tf(){ + if [[ "${TF_JOB_NAME}" == "worker" ]]; then + start_tf_trainer + else + start_tf_pserver + fi +} + +usage() { + echo "usage: tf_k8s []:" + echo " start_tf Start tensorflow jobs" +} + +case "$1" in + start_tf) + start_tf + ;; + --help) + usage + ;; + *) + usage + ;; +esac diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e37c700819119c8af05c40fe4b8d13911efc3e1 --- /dev/null +++ b/benchmark/cluster/vgg16/tf_pserver.yaml @@ -0,0 +1,56 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-tf-pserver +spec: + replicas: 10 + template: + metadata: + labels: + tf-job-pserver: vgg16job-tf + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" + imagePullPolicy: Always + command: ["tf_k8s", "start_tf"] + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PORT + value: "32036" + - name: ENTRY + value: "python vgg16_tf.py" + - name: JOB_NAME + value: vgg16job-tf + - name: PSERVERS_NUM + value: "10" + - name: TF_JOB_NAME + value: "ps" + - name: TRAINERS_NUM + value: "20" + - name: BATCH_SIZE + value: "128" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: NUM_PASSES + value: "1" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08795df3addfa7b618db24a65e57be190e268f06 --- /dev/null +++ b/benchmark/cluster/vgg16/tf_trainer.yaml @@ -0,0 +1,58 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-tf-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + tf-job-trainer: vgg16job-tf + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" + imagePullPolicy: Always + command: ["tf_k8s", "start_tf"] + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PORT + value: "32036" + - name: JOB_NAME + value: vgg16job-tf + - name: TF_JOB_NAME + value: "worker" + - name: ENTRY + value: "python vgg16_tf.py" + - name: PSERVERS_NUM + value: "10" + - name: BATCH_SIZE + value: "128" + - name: TRAINERS_NUM + value: "20" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: NUM_PASSES + value: "1" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 99395699f2ff5a04f340a1ca73d6e9a853981f5c..7323241f4d3bdcbe9c9efcbaaedebe01adbd4701 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -68,6 +68,21 @@ parser.add_argument( type=str2bool, default=True, help='Whether to run as local mode.') + +parser.add_argument( + "--ps_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--trainer_hosts", + 
type=str, + default="", + help="Comma-separated list of hostname:port pairs") + +# Flags for defining the tf.train.Server +parser.add_argument( + "--task_index", type=int, default=0, help="Index of task within the job") args = parser.parse_args() @@ -180,8 +195,9 @@ def main(): iters += 1 num_samples += len(data) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" - % (pass_id, iters, loss, acc, time.time() - ts) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" + % (pass_id, iters, loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time @@ -209,27 +225,24 @@ def main(): batch_size=args.batch_size) train_loop(exe, fluid.default_main_program()) else: - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, "6174"])) - pserver_endpoints = ",".join(eplist) - print("pserver endpoints: ", pserver_endpoints) trainers = int(os.getenv("TRAINERS")) # total trainer count print("trainers total: ", trainers) - current_endpoint = os.getenv( - "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t = fluid.DistributeTranspiler() t.transpile( optimize_ops, params_grads, - pservers=pserver_endpoints, + trainer_id=args.task_index, + pservers=args.ps_hosts, trainers=trainers) if training_role == "PSERVER": + current_endpoint = os.getenv("POD_IP") + ":" + os.getenv( + "PADDLE_INIT_PORT") if not current_endpoint: print("need env SERVER_ENDPOINT") exit(1) diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..996df0e314b867ea8de618dfd3977f490fbe8372 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_tf.py @@ -0,0 +1,362 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in TensorFlow +You can get distribution example template structure here: +https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb +https://www.tensorflow.org/deploy/distributed +""" + +import tensorflow as tf +import paddle.v2 as paddle +import numpy as np +import argparse +import time + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. 
of passes.") +parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, NCHW=[batch, channels, height, width].' + 'Only support NHWC right now.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + +parser.add_argument( + "--ps_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--worker_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--job_name", type=str, default="", help="One of 'worker', 'ps'") +# Flags for defining the tf.train.Server +parser.add_argument( + "--task_index", type=int, default=0, help="Index of task within the job") + +args = parser.parse_args() + + +class VGG16Model(object): + def __init__(self): + self.parameters = [] + + def batch_norm_relu(self, inputs, is_training): + """Performs a batch normalization followed by a ReLU.""" + # We set fused=True for a significant speed boost. See + # https://www.tensorflow.org/speed/speed_guide#common_fused_ops + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if args.data_format == 'NCHW' else -1, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + inputs = tf.nn.relu(inputs) + return inputs + + def conv_bn_layer(self, + name, + images, + kernel_shape, + is_training, + drop_rate=0.0): + with tf.name_scope(name) as scope: + kernel = tf.Variable( + tf.truncated_normal( + kernel_shape, dtype=tf.float32, stddev=1e-1), + name='weights') + conv = tf.nn.conv2d( + images, + kernel, [1, 1, 1, 1], + data_format=args.data_format, + padding='SAME') + biases = tf.Variable( + tf.constant( + 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(conv, biases) + out = self.batch_norm_relu(out, is_training) + out = tf.layers.dropout(out, rate=drop_rate, training=is_training) + return out + + def fc_layer(self, name, inputs, shape): + with tf.name_scope(name) as scope: + fc_w = tf.Variable( + tf.truncated_normal( + shape, dtype=tf.float32, stddev=1e-1), + name='weights') + fc_b = tf.Variable( + tf.constant( + 0.0, shape=[shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) + return out + + def network(self, images, class_dim, is_training): + """ VGG16 model structure. 
+ + TODO(kuke): enable this network to support the 'NCHW' data format + """ + + # conv1 + conv1_1 = self.conv_bn_layer( + 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) + conv1_2 = self.conv_bn_layer( + 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) + # pool1 + pool1 = tf.nn.max_pool( + conv1_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool1') + # conv2 + conv2_1 = self.conv_bn_layer( + 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) + conv2_2 = self.conv_bn_layer( + 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) + # pool2 + pool2 = tf.nn.max_pool( + conv2_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool2') + # conv3 + conv3_1 = self.conv_bn_layer( + 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) + conv3_2 = self.conv_bn_layer( + 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) + conv3_3 = self.conv_bn_layer( + 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) + # pool3 + pool3 = tf.nn.max_pool( + conv3_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool3') + # conv4 + conv4_1 = self.conv_bn_layer( + 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) + conv4_2 = self.conv_bn_layer( + 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv4_3 = self.conv_bn_layer( + 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool4 + pool4 = tf.nn.max_pool( + conv4_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # conv5 + conv5_1 = self.conv_bn_layer( + 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_2 = self.conv_bn_layer( + 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_3 = self.conv_bn_layer( + 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool5 + pool5 = tf.nn.max_pool( + conv5_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # flatten + shape = int(np.prod(pool5.get_shape()[1:])) + pool5_flat = tf.reshape(pool5, [-1, shape]) + # fc1 + drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) + fc1 = self.fc_layer('fc1', drop, [shape, 512]) + # fc2 + bn = self.batch_norm_relu(fc1, is_training) + drop = tf.layers.dropout(bn, rate=0.5, training=is_training) + fc2 = self.fc_layer('fc2', drop, [512, 512]) + + fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) + + return fc3 + + +def run_benchmark(cluster_spec, server): + """Run benchmark on cifar10 or flowers.""" + + if args.data_set == "cifar10": + class_dim = 10 + raw_shape = (3, 32, 32) + dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( + None, 3, 32, 32) + else: + class_dim = 102 + raw_shape = (3, 224, 224) + dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( + None, 3, 224, 224) + + device = tf.train.replica_device_setter( + worker_device="/job:worker/task:{}".format(args.task_index), + cluster=cluster_spec) + + with tf.device(device): + images = tf.placeholder(tf.float32, shape=dat_shape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + vgg16 = VGG16Model() + logits = vgg16.network(images, class_dim, is_training) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + avg_loss = tf.reduce_mean(loss) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = 
tf.reduce_mean(tf.cast(correct, tf.float32)) + + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + global_step = tf.Variable(0, name='global_step', trainable=False) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_loss, global_step=global_step) + + summary_op = tf.summary.merge_all() + init_op = tf.global_variables_initializer() + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + buf_size=5120), + batch_size=args.batch_size) + + # test + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + return np.mean(test_accs) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + hooks = [tf.train.StopAtStepHook(last_step=1000000)] + + with tf.train.MonitoredTrainingSession( + master=server.target, is_chief=(args.task_index == 0), + hooks=hooks) as sess: + iters, num_samples, start_time = 0, 0, 0.0 + for pass_id in range(args.num_passes): + # train + num_samples = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + train_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + train_labels = np.array(map(lambda x: x[1], data)).astype( + 'int64') + iter_begin_time = time.time() + _, loss, acc = sess.run([train_op, avg_loss, accuracy], + feed_dict={ + images: train_images, + labels: train_labels, + is_training: True + }) + iters += 1 + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec" + % (pass_id, iters, loss, acc, + len(data) / (time.time() - iter_begin_time))) + num_samples += len(data) + train_elapsed = time.time() - start_time + # test + pass_test_acc = test() + print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, pass_test_acc)) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + print_arguments() + + ps_hosts = args.ps_hosts.split(",") + worker_hosts = args.worker_hosts.split(",") + + # Create a cluster from the parameter server and worker hosts. + cluster_spec = tf.train.ClusterSpec({ + "ps": ps_hosts, + "worker": worker_hosts + }) + + # Create and start a server for the local task. 
+ server = tf.train.Server( + cluster_spec, job_name=args.job_name, task_index=args.task_index) + + if args.job_name == "ps": + print("start pserver") + server.join() + elif args.job_name == "worker": + print("start worker") + run_benchmark(cluster_spec, server) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index ae3295fe4115f457570203e61a56a637895e4770..7730453fc9292015465713232abda155a18a1aad 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG) if(NOT WITH_GPU) add_definitions(-DHPPL_STUB_FUNC) + add_definitions("-DCUPTI_LIB_PATH=\"\"") list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() @@ -73,7 +74,14 @@ else() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() - + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"") + else() + add_definitions("-DCUPTI_LIB_PATH=\"\"") + message(STATUS "Cannot find CUPTI; GPU profiling will be inaccurate.") + endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index de94bd5008effef1bf0fd3a125d4aed56e1b7f81..7edc8637727e300539a46bc3941ace87c87903b8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -155,7 +155,8 @@ endif() include_directories(${CUDA_INCLUDE_DIRS}) list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + # TODO(panyx0718): CUPTI only allows DSO? + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) endif(NOT WITH_DSO) # setting nvcc arch flags diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake new file mode 100644 index 0000000000000000000000000000000000000000..72ed0f1e5858d6d836743ceb038c7f4ad8f194cf --- /dev/null +++ b/cmake/cupti.cmake @@ -0,0 +1,41 @@ +if(NOT WITH_GPU) + return() +endif() + + +set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT") +find_path(CUPTI_INCLUDE_DIR cupti.h + PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include + $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include + ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include + NO_DEFAULT_PATH + ) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list(APPEND CUPTI_CHECK_LIBRARY_DIRS + ${CUPTI_ROOT} + ${CUPTI_ROOT}/lib64 + ${CUPTI_ROOT}/lib + ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{CUPTI_ROOT} + $ENV{CUPTI_ROOT}/lib64 + $ENV{CUPTI_ROOT}/lib + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) +find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a + PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to cuPTI library.") + +get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY) +if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY) + set(CUPTI_FOUND ON) +else() + set(CUPTI_FOUND OFF) +endif() diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/fluid/data_feeder.rst similarity index 76% rename from doc/api/v2/fluid/data_feeder.rst rename to doc/api/fluid/data_feeder.rst index a591c7334fd31c98a94b50a4344f251560a0f2f9..3df5c0307ffed9d101da58b385840b115920e906 100644 --- a/doc/api/v2/fluid/data_feeder.rst +++ b/doc/api/fluid/data_feeder.rst @@ -8,7 +8,7 @@ data_feeder
DataFeeder ---------- -.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder +.. autoclass:: paddle.fluid.data_feeder.DataFeeder :members: :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/fluid/evaluator.rst similarity index 67% rename from doc/api/v2/fluid/evaluator.rst rename to doc/api/fluid/evaluator.rst index 00dcecfd628a35d83d1c596bf0aea819a1705862..ae9daeb7918d773d7330f419de96c6972a836710 100644 --- a/doc/api/v2/fluid/evaluator.rst +++ b/doc/api/fluid/evaluator.rst @@ -8,14 +8,14 @@ evaluator Accuracy -------- -.. autoclass:: paddle.v2.fluid.evaluator.Accuracy +.. autoclass:: paddle.fluid.evaluator.Accuracy :members: :noindex: ChunkEvaluator -------------- -.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator +.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator :members: :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/fluid/executor.rst similarity index 56% rename from doc/api/v2/fluid/executor.rst rename to doc/api/fluid/executor.rst index a028f6283f2ca333bdf6c9857a98661c0222b41e..a9cdf264e49691afc4b9425b7bfe54f8157ae6c2 100644 --- a/doc/api/v2/fluid/executor.rst +++ b/doc/api/fluid/executor.rst @@ -8,25 +8,25 @@ executor Executor -------- -.. autoclass:: paddle.v2.fluid.executor.Executor +.. autoclass:: paddle.fluid.executor.Executor :members: :noindex: global_scope ------------ -.. autofunction:: paddle.v2.fluid.executor.global_scope +.. autofunction:: paddle.fluid.executor.global_scope :noindex: scope_guard ----------- -.. autofunction:: paddle.v2.fluid.executor.scope_guard +.. autofunction:: paddle.fluid.executor.scope_guard :noindex: switch_scope ------------ -.. autofunction:: paddle.v2.fluid.executor.switch_scope +.. autofunction:: paddle.fluid.executor.switch_scope :noindex: diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/fluid/gen_doc.py similarity index 95% rename from doc/api/v2/fluid/gen_doc.py rename to doc/api/fluid/gen_doc.py index a2147fd3f7ea635d8f14210fbcd1a568ee2230ee..89ab880301b6ac687fd61f556f87f03792c37da3 100644 --- a/doc/api/v2/fluid/gen_doc.py +++ b/doc/api/fluid/gen_doc.py @@ -17,7 +17,7 @@ import argparse import sys import types -import paddle.v2.fluid as fluid +import paddle.fluid as fluid def parse_arg(): @@ -70,7 +70,7 @@ class DocGenerator(object): def print_class(self, name): self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1} + self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1} :members: :noindex: @@ -78,7 +78,7 @@ class DocGenerator(object): def print_method(self, name): self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1} + self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1} :noindex: '''.format(self.module_name, name)) diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/fluid/gen_doc.sh similarity index 100% rename from doc/api/v2/fluid/gen_doc.sh rename to doc/api/fluid/gen_doc.sh diff --git a/doc/api/fluid/index.rst b/doc/api/fluid/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..b0710d8b19956eb235890fdb2a2d764084416aa5 --- /dev/null +++ b/doc/api/fluid/index.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. 
toctree:: + :maxdepth: 1 + + layers.rst + data_feeder.rst + executor.rst + initializer.rst + evaluator.rst + nets.rst + optimizer.rst + param_attr.rst + profiler.rst + regularizer.rst + io.rst diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/fluid/initializer.rst similarity index 59% rename from doc/api/v2/fluid/initializer.rst rename to doc/api/fluid/initializer.rst index c38be033fff2997930525f51c93995db09daa2b6..ee69925fda6b3fc850cfb632e8edd359e7fcff9c 100644 --- a/doc/api/v2/fluid/initializer.rst +++ b/doc/api/fluid/initializer.rst @@ -8,28 +8,28 @@ initializer Constant -------- -.. autoclass:: paddle.v2.fluid.initializer.Constant +.. autoclass:: paddle.fluid.initializer.Constant :members: :noindex: Uniform ------- -.. autoclass:: paddle.v2.fluid.initializer.Uniform +.. autoclass:: paddle.fluid.initializer.Uniform :members: :noindex: Normal ------ -.. autoclass:: paddle.v2.fluid.initializer.Normal +.. autoclass:: paddle.fluid.initializer.Normal :members: :noindex: Xavier ------ -.. autoclass:: paddle.v2.fluid.initializer.Xavier +.. autoclass:: paddle.fluid.initializer.Xavier :members: :noindex: diff --git a/doc/api/v2/fluid/io.rst b/doc/api/fluid/io.rst similarity index 52% rename from doc/api/v2/fluid/io.rst rename to doc/api/fluid/io.rst index 37c9c273e369532e8ff596e9649cb695a98a2505..dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61 100644 --- a/doc/api/v2/fluid/io.rst +++ b/doc/api/fluid/io.rst @@ -8,54 +8,54 @@ io save_vars --------- -.. autofunction:: paddle.v2.fluid.io.save_vars +.. autofunction:: paddle.fluid.io.save_vars :noindex: save_params ----------- -.. autofunction:: paddle.v2.fluid.io.save_params +.. autofunction:: paddle.fluid.io.save_params :noindex: save_persistables ----------------- -.. autofunction:: paddle.v2.fluid.io.save_persistables +.. autofunction:: paddle.fluid.io.save_persistables :noindex: load_vars --------- -.. autofunction:: paddle.v2.fluid.io.load_vars +.. autofunction:: paddle.fluid.io.load_vars :noindex: load_params ----------- -.. autofunction:: paddle.v2.fluid.io.load_params +.. autofunction:: paddle.fluid.io.load_params :noindex: load_persistables ----------------- -.. autofunction:: paddle.v2.fluid.io.load_persistables +.. autofunction:: paddle.fluid.io.load_persistables :noindex: save_inference_model -------------------- -.. autofunction:: paddle.v2.fluid.io.save_inference_model +.. autofunction:: paddle.fluid.io.save_inference_model :noindex: load_inference_model -------------------- -.. autofunction:: paddle.v2.fluid.io.load_inference_model +.. autofunction:: paddle.fluid.io.load_inference_model :noindex: get_inference_program --------------------- -.. autofunction:: paddle.v2.fluid.io.get_inference_program +.. autofunction:: paddle.fluid.io.get_inference_program :noindex: diff --git a/doc/api/fluid/layers.rst b/doc/api/fluid/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..ae35d8c53476b34cb18331364267dd7c8b94dd64 --- /dev/null +++ b/doc/api/fluid/layers.rst @@ -0,0 +1,805 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +====== +layers +====== + +control_flow +============ + +split_lod_tensor +---------------- + +.. autofunction:: paddle.fluid.layers.split_lod_tensor + :noindex: + +merge_lod_tensor +---------------- + +.. autofunction:: paddle.fluid.layers.merge_lod_tensor + :noindex: + +BlockGuard +---------- + +.. autoclass:: paddle.fluid.layers.BlockGuard + :members: + :noindex: + +BlockGuardWithCompletion +------------------------ + +.. 
autoclass:: paddle.fluid.layers.BlockGuardWithCompletion + :members: + :noindex: + +StaticRNNMemoryLink +------------------- + +.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink + :members: + :noindex: + +WhileGuard +---------- + +.. autoclass:: paddle.fluid.layers.WhileGuard + :members: + :noindex: + +While +----- + +.. autoclass:: paddle.fluid.layers.While + :members: + :noindex: + +lod_rank_table +-------------- + +.. autofunction:: paddle.fluid.layers.lod_rank_table + :noindex: + +max_sequence_len +---------------- + +.. autofunction:: paddle.fluid.layers.max_sequence_len + :noindex: + +topk +---- + +.. autofunction:: paddle.fluid.layers.topk + :noindex: + +lod_tensor_to_array +------------------- + +.. autofunction:: paddle.fluid.layers.lod_tensor_to_array + :noindex: + +array_to_lod_tensor +------------------- + +.. autofunction:: paddle.fluid.layers.array_to_lod_tensor + :noindex: + +increment +--------- + +.. autofunction:: paddle.fluid.layers.increment + :noindex: + +array_write +----------- + +.. autofunction:: paddle.fluid.layers.array_write + :noindex: + +create_array +------------ + +.. autofunction:: paddle.fluid.layers.create_array + :noindex: + +less_than +--------- + +.. autofunction:: paddle.fluid.layers.less_than + :noindex: + +array_read +---------- + +.. autofunction:: paddle.fluid.layers.array_read + :noindex: + +shrink_memory +------------- + +.. autofunction:: paddle.fluid.layers.shrink_memory + :noindex: + +array_length +------------ + +.. autofunction:: paddle.fluid.layers.array_length + :noindex: + +IfElse +------ + +.. autoclass:: paddle.fluid.layers.IfElse + :members: + :noindex: + +DynamicRNN +---------- + +.. autoclass:: paddle.fluid.layers.DynamicRNN + :members: + :noindex: + +ConditionalBlock +---------------- + +.. autoclass:: paddle.fluid.layers.ConditionalBlock + :members: + :noindex: + +StaticRNN +--------- + +.. autoclass:: paddle.fluid.layers.StaticRNN + :members: + :noindex: + +reorder_lod_tensor_by_rank +-------------------------- + +.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank + :noindex: + +ParallelDo +---------- + +.. autoclass:: paddle.fluid.layers.ParallelDo + :members: + :noindex: + +Print +----- + +.. autofunction:: paddle.fluid.layers.Print + :noindex: + +device +====== + +get_places +---------- + +.. autofunction:: paddle.fluid.layers.get_places + :noindex: + +io +== + +data +---- + +.. autofunction:: paddle.fluid.layers.data + :noindex: + +BlockGuardServ +-------------- + +.. autoclass:: paddle.fluid.layers.BlockGuardServ + :members: + :noindex: + +ListenAndServ +------------- + +.. autoclass:: paddle.fluid.layers.ListenAndServ + :members: + :noindex: + +Send +---- + +.. autofunction:: paddle.fluid.layers.Send + :noindex: + +nn +== + +fc +-- + +.. autofunction:: paddle.fluid.layers.fc + :noindex: + +embedding +--------- + +.. autofunction:: paddle.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ + +.. autofunction:: paddle.fluid.layers.dynamic_lstm + :noindex: + +dynamic_lstmp +------------- + +.. autofunction:: paddle.fluid.layers.dynamic_lstmp + :noindex: + +dynamic_gru +----------- + +.. autofunction:: paddle.fluid.layers.dynamic_gru + :noindex: + +gru_unit +-------- + +.. autofunction:: paddle.fluid.layers.gru_unit + :noindex: + +linear_chain_crf +---------------- + +.. autofunction:: paddle.fluid.layers.linear_chain_crf + :noindex: + +crf_decoding +------------ + +.. autofunction:: paddle.fluid.layers.crf_decoding + :noindex: + +cos_sim +------- + +.. 
autofunction:: paddle.fluid.layers.cos_sim + :noindex: + +cross_entropy +------------- + +.. autofunction:: paddle.fluid.layers.cross_entropy + :noindex: + +square_error_cost +----------------- + +.. autofunction:: paddle.fluid.layers.square_error_cost + :noindex: + +accuracy +-------- + +.. autofunction:: paddle.fluid.layers.accuracy + :noindex: + +chunk_eval +---------- + +.. autofunction:: paddle.fluid.layers.chunk_eval + :noindex: + +sequence_conv +------------- + +.. autofunction:: paddle.fluid.layers.sequence_conv + :noindex: + +conv2d +------ + +.. autofunction:: paddle.fluid.layers.conv2d + :noindex: + +sequence_pool +------------- + +.. autofunction:: paddle.fluid.layers.sequence_pool + :noindex: + +pool2d +------ + +.. autofunction:: paddle.fluid.layers.pool2d + :noindex: + +batch_norm +---------- + +.. autofunction:: paddle.fluid.layers.batch_norm + :noindex: + +layer_norm +---------- + +.. autofunction:: paddle.fluid.layers.layer_norm + :noindex: + +beam_search_decode +------------------ + +.. autofunction:: paddle.fluid.layers.beam_search_decode + :noindex: + +conv2d_transpose +---------------- + +.. autofunction:: paddle.fluid.layers.conv2d_transpose + :noindex: + +sequence_expand +--------------- + +.. autofunction:: paddle.fluid.layers.sequence_expand + :noindex: + +lstm_unit +--------- + +.. autofunction:: paddle.fluid.layers.lstm_unit + :noindex: + +reduce_sum +---------- + +.. autofunction:: paddle.fluid.layers.reduce_sum + :noindex: + +reduce_mean +----------- + +.. autofunction:: paddle.fluid.layers.reduce_mean + :noindex: + +reduce_max +---------- + +.. autofunction:: paddle.fluid.layers.reduce_max + :noindex: + +reduce_min +---------- + +.. autofunction:: paddle.fluid.layers.reduce_min + :noindex: + +sequence_first_step +------------------- + +.. autofunction:: paddle.fluid.layers.sequence_first_step + :noindex: + +sequence_last_step +------------------ + +.. autofunction:: paddle.fluid.layers.sequence_last_step + :noindex: + +dropout +------- + +.. autofunction:: paddle.fluid.layers.dropout + :noindex: + +split +----- + +.. autofunction:: paddle.fluid.layers.split + :noindex: + +ctc_greedy_decoder +------------------ + +.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder + :noindex: + +edit_distance +------------- + +.. autofunction:: paddle.fluid.layers.edit_distance + :noindex: + +l2_normalize +------------ + +.. autofunction:: paddle.fluid.layers.l2_normalize + :noindex: + +matmul +------ + +.. autofunction:: paddle.fluid.layers.matmul + :noindex: + +warpctc +------- + +.. autofunction:: paddle.fluid.layers.warpctc + :noindex: + +sequence_reshape +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_reshape + :noindex: + +transpose +--------- + +.. autofunction:: paddle.fluid.layers.transpose + :noindex: + +im2sequence +----------- + +.. autofunction:: paddle.fluid.layers.im2sequence + :noindex: + +nce +--- + +.. autofunction:: paddle.fluid.layers.nce + :noindex: + +beam_search +----------- + +.. autofunction:: paddle.fluid.layers.beam_search + :noindex: + +row_conv +-------- + +.. autofunction:: paddle.fluid.layers.row_conv + :noindex: + +multiplex +--------- + +.. autofunction:: paddle.fluid.layers.multiplex + :noindex: + +ops +=== + +mean +---- + +.. autofunction:: paddle.fluid.layers.mean + :noindex: + +mul +--- + +.. autofunction:: paddle.fluid.layers.mul + :noindex: + +reshape +------- + +.. autofunction:: paddle.fluid.layers.reshape + :noindex: + +scale +----- + +.. 
autofunction:: paddle.fluid.layers.scale + :noindex: + +sigmoid_cross_entropy_with_logits +--------------------------------- + +.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits + :noindex: + +elementwise_add +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_add + :noindex: + +elementwise_div +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_div + :noindex: + +elementwise_sub +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_sub + :noindex: + +elementwise_mul +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_mul + :noindex: + +elementwise_max +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_max + :noindex: + +elementwise_min +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_min + :noindex: + +elementwise_pow +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_pow + :noindex: + +clip +---- + +.. autofunction:: paddle.fluid.layers.clip + :noindex: + +clip_by_norm +------------ + +.. autofunction:: paddle.fluid.layers.clip_by_norm + :noindex: + +sequence_softmax +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_softmax + :noindex: + +sigmoid +------- + +.. autofunction:: paddle.fluid.layers.sigmoid + :noindex: + +logsigmoid +---------- + +.. autofunction:: paddle.fluid.layers.logsigmoid + :noindex: + +exp +--- + +.. autofunction:: paddle.fluid.layers.exp + :noindex: + +relu +---- + +.. autofunction:: paddle.fluid.layers.relu + :noindex: + +tanh +---- + +.. autofunction:: paddle.fluid.layers.tanh + :noindex: + +tanh_shrink +----------- + +.. autofunction:: paddle.fluid.layers.tanh_shrink + :noindex: + +softshrink +---------- + +.. autofunction:: paddle.fluid.layers.softshrink + :noindex: + +sqrt +---- + +.. autofunction:: paddle.fluid.layers.sqrt + :noindex: + +abs +--- + +.. autofunction:: paddle.fluid.layers.abs + :noindex: + +ceil +---- + +.. autofunction:: paddle.fluid.layers.ceil + :noindex: + +floor +----- + +.. autofunction:: paddle.fluid.layers.floor + :noindex: + +round +----- + +.. autofunction:: paddle.fluid.layers.round + :noindex: + +reciprocal +---------- + +.. autofunction:: paddle.fluid.layers.reciprocal + :noindex: + +log +--- + +.. autofunction:: paddle.fluid.layers.log + :noindex: + +square +------ + +.. autofunction:: paddle.fluid.layers.square + :noindex: + +softplus +-------- + +.. autofunction:: paddle.fluid.layers.softplus + :noindex: + +softsign +-------- + +.. autofunction:: paddle.fluid.layers.softsign + :noindex: + +brelu +----- + +.. autofunction:: paddle.fluid.layers.brelu + :noindex: + +leaky_relu +---------- + +.. autofunction:: paddle.fluid.layers.leaky_relu + :noindex: + +soft_relu +--------- + +.. autofunction:: paddle.fluid.layers.soft_relu + :noindex: + +elu +--- + +.. autofunction:: paddle.fluid.layers.elu + :noindex: + +relu6 +----- + +.. autofunction:: paddle.fluid.layers.relu6 + :noindex: + +pow +--- + +.. autofunction:: paddle.fluid.layers.pow + :noindex: + +stanh +----- + +.. autofunction:: paddle.fluid.layers.stanh + :noindex: + +hard_shrink +----------- + +.. autofunction:: paddle.fluid.layers.hard_shrink + :noindex: + +thresholded_relu +---------------- + +.. autofunction:: paddle.fluid.layers.thresholded_relu + :noindex: + +hard_sigmoid +------------ + +.. autofunction:: paddle.fluid.layers.hard_sigmoid + :noindex: + +swish +----- + +.. autofunction:: paddle.fluid.layers.swish + :noindex: + +tensor +====== + +create_tensor +------------- + +.. 
autofunction:: paddle.fluid.layers.create_tensor + :noindex: + +create_parameter +---------------- + +.. autofunction:: paddle.fluid.layers.create_parameter + :noindex: + +create_global_var +----------------- + +.. autofunction:: paddle.fluid.layers.create_global_var + :noindex: + +cast +---- + +.. autofunction:: paddle.fluid.layers.cast + :noindex: + +concat +------ + +.. autofunction:: paddle.fluid.layers.concat + :noindex: + +sums +---- + +.. autofunction:: paddle.fluid.layers.sums + :noindex: + +assign +------ + +.. autofunction:: paddle.fluid.layers.assign + :noindex: + +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like + :noindex: + +fill_constant +------------- + +.. autofunction:: paddle.fluid.layers.fill_constant + :noindex: + +ones +---- + +.. autofunction:: paddle.fluid.layers.ones + :noindex: + +zeros +----- + +.. autofunction:: paddle.fluid.layers.zeros + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/fluid/nets.rst similarity index 57% rename from doc/api/v2/fluid/nets.rst rename to doc/api/fluid/nets.rst index 015581b7660848bdb0845fafe2d3fc05405e6ae6..7ae3187304f386a08c5cb8a4ba093423a58a7f36 100644 --- a/doc/api/v2/fluid/nets.rst +++ b/doc/api/fluid/nets.rst @@ -8,24 +8,24 @@ nets simple_img_conv_pool -------------------- -.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool +.. autofunction:: paddle.fluid.nets.simple_img_conv_pool :noindex: sequence_conv_pool ------------------ -.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool +.. autofunction:: paddle.fluid.nets.sequence_conv_pool :noindex: glu --- -.. autofunction:: paddle.v2.fluid.nets.glu +.. autofunction:: paddle.fluid.nets.glu :noindex: scaled_dot_product_attention ---------------------------- -.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention +.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention :noindex: diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/fluid/optimizer.rst similarity index 56% rename from doc/api/v2/fluid/optimizer.rst rename to doc/api/fluid/optimizer.rst index 1691ebb9a7cb16da96e04147d0adea322374f529..9b165f870459b4f9ef2efe24f5604a3fcb96f7f3 100644 --- a/doc/api/v2/fluid/optimizer.rst +++ b/doc/api/fluid/optimizer.rst @@ -8,42 +8,42 @@ optimizer SGD --- -.. autoclass:: paddle.v2.fluid.optimizer.SGD +.. autoclass:: paddle.fluid.optimizer.SGD :members: :noindex: Momentum -------- -.. autoclass:: paddle.v2.fluid.optimizer.Momentum +.. autoclass:: paddle.fluid.optimizer.Momentum :members: :noindex: Adagrad ------- -.. autoclass:: paddle.v2.fluid.optimizer.Adagrad +.. autoclass:: paddle.fluid.optimizer.Adagrad :members: :noindex: Adam ---- -.. autoclass:: paddle.v2.fluid.optimizer.Adam +.. autoclass:: paddle.fluid.optimizer.Adam :members: :noindex: Adamax ------ -.. autoclass:: paddle.v2.fluid.optimizer.Adamax +.. autoclass:: paddle.fluid.optimizer.Adamax :members: :noindex: DecayedAdagrad -------------- -.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad +.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad :members: :noindex: diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/fluid/param_attr.rst similarity index 67% rename from doc/api/v2/fluid/param_attr.rst rename to doc/api/fluid/param_attr.rst index 8083d0d858dafcd275eaddb9b475875ee42ef724..8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075 100644 --- a/doc/api/v2/fluid/param_attr.rst +++ b/doc/api/fluid/param_attr.rst @@ -8,14 +8,14 @@ param_attr ParamAttr --------- -.. 
autoclass:: paddle.v2.fluid.param_attr.ParamAttr +.. autoclass:: paddle.fluid.param_attr.ParamAttr :members: :noindex: WeightNormParamAttr ------------------- -.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr +.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr :members: :noindex: diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/fluid/profiler.rst similarity index 58% rename from doc/api/v2/fluid/profiler.rst rename to doc/api/fluid/profiler.rst index 4a1ff7cb6976e0054f77428b699ea679aa91394f..74d102dcb0db35766c34e3d14939a8aa5861686b 100644 --- a/doc/api/v2/fluid/profiler.rst +++ b/doc/api/fluid/profiler.rst @@ -8,18 +8,18 @@ profiler cuda_profiler ------------- -.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler +.. autofunction:: paddle.fluid.profiler.cuda_profiler :noindex: reset_profiler -------------- -.. autofunction:: paddle.v2.fluid.profiler.reset_profiler +.. autofunction:: paddle.fluid.profiler.reset_profiler :noindex: profiler -------- -.. autofunction:: paddle.v2.fluid.profiler.profiler +.. autofunction:: paddle.fluid.profiler.profiler :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/fluid/regularizer.rst similarity index 61% rename from doc/api/v2/fluid/regularizer.rst rename to doc/api/fluid/regularizer.rst index 2c17d15599baa1d02eb87c7b6c40034769ebb3a4..dc9740c46392567d314121ac401540b0e7382703 100644 --- a/doc/api/v2/fluid/regularizer.rst +++ b/doc/api/fluid/regularizer.rst @@ -8,20 +8,20 @@ regularizer append_regularization_ops ------------------------- -.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops +.. autofunction:: paddle.fluid.regularizer.append_regularization_ops :noindex: L1Decay ------- -.. autoclass:: paddle.v2.fluid.regularizer.L1Decay +.. autoclass:: paddle.fluid.regularizer.L1Decay :members: :noindex: L2Decay ------- -.. autoclass:: paddle.v2.fluid.regularizer.L2Decay +.. autoclass:: paddle.fluid.regularizer.L2Decay :members: :noindex: diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst deleted file mode 100644 index 84f9097a6cdc2da269bd6a0685796e14e26da37e..0000000000000000000000000000000000000000 --- a/doc/api/index_cn.rst +++ /dev/null @@ -1,10 +0,0 @@ -API -=== - -.. toctree:: - :maxdepth: 1 - - 模型配置 - 数据访问 - 训练与应用 - v2/fluid.rst diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338..fc8dbd07eba248942c03c2b609cfc8d8712ed0c7 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -4,7 +4,8 @@ API .. toctree:: :maxdepth: 1 + overview.rst v2/model_configs.rst v2/data.rst v2/run_logic.rst - v2/fluid.rst + fluid/index.rst diff --git a/doc/api/overview.rst b/doc/api/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..16b6cf42660c51feee09c689c671d5ef06663efb --- /dev/null +++ b/doc/api/overview.rst @@ -0,0 +1,16 @@ +V2 API Overview +================ + +The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1 (the original layer-based platform of PaddlePaddle); +it introduces high-level concepts such as `Layers `_ , `Optimizer `_ , `Evaluator `_ and `Data Reader `_ to make model configuration more familiar to users. + +A model is composed of the computation described by a group of `Layers`, with an `Evaluator` to define the error, an `Optimizer` to update the parameters and a `Data Reader` to feed in the data.
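+
+A minimal sketch of these concepts working together (the classic fit-a-line
+demo; the layer, optimizer and dataset names below are illustrative of the V2
+API rather than a normative reference):
+
+.. code-block:: python
+
+    import paddle.v2 as paddle
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # layers describe the computation
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+    cost = paddle.layer.square_error_cost(input=y_predict, label=y)
+
+    # the optimizer updates the parameters; the data reader feeds in the data
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Momentum(momentum=0.9)
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    def event_handler(event):
+        # events expose internal running details to user callbacks
+        if isinstance(event, paddle.event.EndIteration):
+            print("Pass %d, Batch %d, Cost %f" %
+                  (event.pass_id, event.batch_id, event.cost))
+
+    trainer.train(
+        reader=paddle.batch(paddle.dataset.uci_housing.train(), batch_size=2),
+        feeding={'x': 0, 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)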
+ +We also provide the `interface for Training and Inference `_ to help control the training and inference phase; +it has several easy-to-use methods: + +- `paddle.train` +- `paddle.test` +- `paddle.infer` + +To better expose the internal running details, different `events `_ are made available to users through callbacks. diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst deleted file mode 100644 index 5f15cad2b530dfb3702357b3c26885ac2a7b7beb..0000000000000000000000000000000000000000 --- a/doc/api/v2/fluid.rst +++ /dev/null @@ -1,18 +0,0 @@ -====================== -Fluid -====================== - -.. toctree:: - :maxdepth: 1 - - fluid/layers.rst - fluid/data_feeder.rst - fluid/executor.rst - fluid/initializer.rst - fluid/evaluator.rst - fluid/nets.rst - fluid/optimizer.rst - fluid/param_attr.rst - fluid/profiler.rst - fluid/regularizer.rst - fluid/io.rst diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst deleted file mode 100644 index 58c493fd7412cf9dbe507c9622d67dae33a5fb25..0000000000000000000000000000000000000000 --- a/doc/api/v2/fluid/layers.rst +++ /dev/null @@ -1,805 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -====== -layers -====== - -control_flow -============ - -split_lod_tensor ----------------- - -.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor - :noindex: - -merge_lod_tensor ----------------- - -.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor - :noindex: - -BlockGuard ----------- - -.. autoclass:: paddle.v2.fluid.layers.BlockGuard - :members: - :noindex: - -BlockGuardWithCompletion ------------------------- - -.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion - :members: - :noindex: - -StaticRNNMemoryLink ------------------- - -.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink - :members: - :noindex: - -WhileGuard ----------- - -.. autoclass:: paddle.v2.fluid.layers.WhileGuard - :members: - :noindex: - -While ------ - -.. autoclass:: paddle.v2.fluid.layers.While - :members: - :noindex: - -lod_rank_table --------------- - -.. autofunction:: paddle.v2.fluid.layers.lod_rank_table - :noindex: - -max_sequence_len ----------------- - -.. autofunction:: paddle.v2.fluid.layers.max_sequence_len - :noindex: - -topk ---- - -.. autofunction:: paddle.v2.fluid.layers.topk - :noindex: - -lod_tensor_to_array ------------------- - -.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array - :noindex: - -array_to_lod_tensor ------------------- - -.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor - :noindex: - -increment ---------- - -.. autofunction:: paddle.v2.fluid.layers.increment - :noindex: - -array_write ------------ - -.. autofunction:: paddle.v2.fluid.layers.array_write - :noindex: - -create_array ------------- - -.. autofunction:: paddle.v2.fluid.layers.create_array - :noindex: - -less_than ---------- - -.. autofunction:: paddle.v2.fluid.layers.less_than - :noindex: - -array_read ----------- - -.. autofunction:: paddle.v2.fluid.layers.array_read - :noindex: - -shrink_memory ------------- - -.. autofunction:: paddle.v2.fluid.layers.shrink_memory - :noindex: - -array_length ------------ - -.. autofunction:: paddle.v2.fluid.layers.array_length - :noindex: - -IfElse ------ - -.. autoclass:: paddle.v2.fluid.layers.IfElse - :members: - :noindex: - -DynamicRNN ---------- - -.. autoclass:: paddle.v2.fluid.layers.DynamicRNN - :members: - :noindex: - -ConditionalBlock ---------------- - -..
autoclass:: paddle.v2.fluid.layers.ConditionalBlock - :members: - :noindex: - -StaticRNN ---------- - -.. autoclass:: paddle.v2.fluid.layers.StaticRNN - :members: - :noindex: - -reorder_lod_tensor_by_rank --------------------------- - -.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank - :noindex: - -ParallelDo ----------- - -.. autoclass:: paddle.v2.fluid.layers.ParallelDo - :members: - :noindex: - -Print ------ - -.. autofunction:: paddle.v2.fluid.layers.Print - :noindex: - -device -====== - -get_places ----------- - -.. autofunction:: paddle.v2.fluid.layers.get_places - :noindex: - -io -== - -data ----- - -.. autofunction:: paddle.v2.fluid.layers.data - :noindex: - -BlockGuardServ --------------- - -.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ - :members: - :noindex: - -ListenAndServ -------------- - -.. autoclass:: paddle.v2.fluid.layers.ListenAndServ - :members: - :noindex: - -Send ----- - -.. autofunction:: paddle.v2.fluid.layers.Send - :noindex: - -nn -== - -fc --- - -.. autofunction:: paddle.v2.fluid.layers.fc - :noindex: - -embedding ---------- - -.. autofunction:: paddle.v2.fluid.layers.embedding - :noindex: - -dynamic_lstm ------------- - -.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm - :noindex: - -dynamic_lstmp -------------- - -.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp - :noindex: - -dynamic_gru ------------ - -.. autofunction:: paddle.v2.fluid.layers.dynamic_gru - :noindex: - -gru_unit --------- - -.. autofunction:: paddle.v2.fluid.layers.gru_unit - :noindex: - -linear_chain_crf ----------------- - -.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf - :noindex: - -crf_decoding ------------- - -.. autofunction:: paddle.v2.fluid.layers.crf_decoding - :noindex: - -cos_sim -------- - -.. autofunction:: paddle.v2.fluid.layers.cos_sim - :noindex: - -cross_entropy -------------- - -.. autofunction:: paddle.v2.fluid.layers.cross_entropy - :noindex: - -square_error_cost ------------------ - -.. autofunction:: paddle.v2.fluid.layers.square_error_cost - :noindex: - -accuracy --------- - -.. autofunction:: paddle.v2.fluid.layers.accuracy - :noindex: - -chunk_eval ----------- - -.. autofunction:: paddle.v2.fluid.layers.chunk_eval - :noindex: - -sequence_conv -------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_conv - :noindex: - -conv2d ------- - -.. autofunction:: paddle.v2.fluid.layers.conv2d - :noindex: - -sequence_pool -------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_pool - :noindex: - -pool2d ------- - -.. autofunction:: paddle.v2.fluid.layers.pool2d - :noindex: - -batch_norm ----------- - -.. autofunction:: paddle.v2.fluid.layers.batch_norm - :noindex: - -layer_norm ----------- - -.. autofunction:: paddle.v2.fluid.layers.layer_norm - :noindex: - -beam_search_decode ------------------- - -.. autofunction:: paddle.v2.fluid.layers.beam_search_decode - :noindex: - -conv2d_transpose ----------------- - -.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose - :noindex: - -sequence_expand ---------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_expand - :noindex: - -lstm_unit ---------- - -.. autofunction:: paddle.v2.fluid.layers.lstm_unit - :noindex: - -reduce_sum ----------- - -.. autofunction:: paddle.v2.fluid.layers.reduce_sum - :noindex: - -reduce_mean ------------ - -.. autofunction:: paddle.v2.fluid.layers.reduce_mean - :noindex: - -reduce_max ----------- - -.. autofunction:: paddle.v2.fluid.layers.reduce_max - :noindex: - -reduce_min ----------- - -.. 
autofunction:: paddle.v2.fluid.layers.reduce_min - :noindex: - -sequence_first_step -------------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_first_step - :noindex: - -sequence_last_step ------------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_last_step - :noindex: - -dropout -------- - -.. autofunction:: paddle.v2.fluid.layers.dropout - :noindex: - -split ------ - -.. autofunction:: paddle.v2.fluid.layers.split - :noindex: - -ctc_greedy_decoder ------------------- - -.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder - :noindex: - -edit_distance -------------- - -.. autofunction:: paddle.v2.fluid.layers.edit_distance - :noindex: - -l2_normalize ------------- - -.. autofunction:: paddle.v2.fluid.layers.l2_normalize - :noindex: - -matmul ------- - -.. autofunction:: paddle.v2.fluid.layers.matmul - :noindex: - -warpctc -------- - -.. autofunction:: paddle.v2.fluid.layers.warpctc - :noindex: - -sequence_reshape ----------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_reshape - :noindex: - -transpose ---------- - -.. autofunction:: paddle.v2.fluid.layers.transpose - :noindex: - -im2sequence ------------ - -.. autofunction:: paddle.v2.fluid.layers.im2sequence - :noindex: - -nce ---- - -.. autofunction:: paddle.v2.fluid.layers.nce - :noindex: - -beam_search ------------ - -.. autofunction:: paddle.v2.fluid.layers.beam_search - :noindex: - -row_conv --------- - -.. autofunction:: paddle.v2.fluid.layers.row_conv - :noindex: - -multiplex ---------- - -.. autofunction:: paddle.v2.fluid.layers.multiplex - :noindex: - -ops -=== - -mean ----- - -.. autofunction:: paddle.v2.fluid.layers.mean - :noindex: - -mul ---- - -.. autofunction:: paddle.v2.fluid.layers.mul - :noindex: - -reshape -------- - -.. autofunction:: paddle.v2.fluid.layers.reshape - :noindex: - -scale ------ - -.. autofunction:: paddle.v2.fluid.layers.scale - :noindex: - -sigmoid_cross_entropy_with_logits ---------------------------------- - -.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits - :noindex: - -elementwise_add ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_add - :noindex: - -elementwise_div ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_div - :noindex: - -elementwise_sub ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_sub - :noindex: - -elementwise_mul ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_mul - :noindex: - -elementwise_max ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_max - :noindex: - -elementwise_min ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_min - :noindex: - -elementwise_pow ---------------- - -.. autofunction:: paddle.v2.fluid.layers.elementwise_pow - :noindex: - -clip ----- - -.. autofunction:: paddle.v2.fluid.layers.clip - :noindex: - -clip_by_norm ------------- - -.. autofunction:: paddle.v2.fluid.layers.clip_by_norm - :noindex: - -sequence_softmax ----------------- - -.. autofunction:: paddle.v2.fluid.layers.sequence_softmax - :noindex: - -sigmoid -------- - -.. autofunction:: paddle.v2.fluid.layers.sigmoid - :noindex: - -logsigmoid ----------- - -.. autofunction:: paddle.v2.fluid.layers.logsigmoid - :noindex: - -exp ---- - -.. autofunction:: paddle.v2.fluid.layers.exp - :noindex: - -relu ----- - -.. autofunction:: paddle.v2.fluid.layers.relu - :noindex: - -tanh ----- - -.. autofunction:: paddle.v2.fluid.layers.tanh - :noindex: - -tanh_shrink ------------ - -.. 
autofunction:: paddle.v2.fluid.layers.tanh_shrink - :noindex: - -softshrink ----------- - -.. autofunction:: paddle.v2.fluid.layers.softshrink - :noindex: - -sqrt ----- - -.. autofunction:: paddle.v2.fluid.layers.sqrt - :noindex: - -abs ---- - -.. autofunction:: paddle.v2.fluid.layers.abs - :noindex: - -ceil ----- - -.. autofunction:: paddle.v2.fluid.layers.ceil - :noindex: - -floor ------ - -.. autofunction:: paddle.v2.fluid.layers.floor - :noindex: - -round ------ - -.. autofunction:: paddle.v2.fluid.layers.round - :noindex: - -reciprocal ----------- - -.. autofunction:: paddle.v2.fluid.layers.reciprocal - :noindex: - -log ---- - -.. autofunction:: paddle.v2.fluid.layers.log - :noindex: - -square ------- - -.. autofunction:: paddle.v2.fluid.layers.square - :noindex: - -softplus --------- - -.. autofunction:: paddle.v2.fluid.layers.softplus - :noindex: - -softsign --------- - -.. autofunction:: paddle.v2.fluid.layers.softsign - :noindex: - -brelu ------ - -.. autofunction:: paddle.v2.fluid.layers.brelu - :noindex: - -leaky_relu ----------- - -.. autofunction:: paddle.v2.fluid.layers.leaky_relu - :noindex: - -soft_relu ---------- - -.. autofunction:: paddle.v2.fluid.layers.soft_relu - :noindex: - -elu ---- - -.. autofunction:: paddle.v2.fluid.layers.elu - :noindex: - -relu6 ------ - -.. autofunction:: paddle.v2.fluid.layers.relu6 - :noindex: - -pow ---- - -.. autofunction:: paddle.v2.fluid.layers.pow - :noindex: - -stanh ------ - -.. autofunction:: paddle.v2.fluid.layers.stanh - :noindex: - -hard_shrink ------------ - -.. autofunction:: paddle.v2.fluid.layers.hard_shrink - :noindex: - -thresholded_relu ----------------- - -.. autofunction:: paddle.v2.fluid.layers.thresholded_relu - :noindex: - -hard_sigmoid ------------- - -.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid - :noindex: - -swish ------ - -.. autofunction:: paddle.v2.fluid.layers.swish - :noindex: - -tensor -====== - -create_tensor -------------- - -.. autofunction:: paddle.v2.fluid.layers.create_tensor - :noindex: - -create_parameter ----------------- - -.. autofunction:: paddle.v2.fluid.layers.create_parameter - :noindex: - -create_global_var ------------------ - -.. autofunction:: paddle.v2.fluid.layers.create_global_var - :noindex: - -cast ----- - -.. autofunction:: paddle.v2.fluid.layers.cast - :noindex: - -concat ------- - -.. autofunction:: paddle.v2.fluid.layers.concat - :noindex: - -sums ----- - -.. autofunction:: paddle.v2.fluid.layers.sums - :noindex: - -assign ------- - -.. autofunction:: paddle.v2.fluid.layers.assign - :noindex: - -fill_constant_batch_size_like ------------------------------ - -.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like - :noindex: - -fill_constant -------------- - -.. autofunction:: paddle.v2.fluid.layers.fill_constant - :noindex: - -ones ----- - -.. autofunction:: paddle.v2.fluid.layers.ones - :noindex: - -zeros ------ - -.. 
autofunction:: paddle.v2.fluid.layers.zeros - :noindex: - diff --git a/doc/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst index fec2d412f03f6b94422f0463d1985decd0c1bf99..cb766c3838133740892928b587edcf3843b7abce 100644 --- a/doc/build_and_install/build_from_source_cn.rst +++ b/doc/build_and_install/build_from_source_cn.rst @@ -189,7 +189,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B "WITH_TESTING", "是否开启单元测试", "OFF" "WITH_DOC", "是否编译中英文文档", "OFF" "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" - "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" BLAS diff --git a/doc/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst index 29a1439e4cec50c15cb965a788070f21c704caad..556cbfdf087c340a7f7a1760f92325ab87eeea89 100644 --- a/doc/build_and_install/build_from_source_en.rst +++ b/doc/build_and_install/build_from_source_en.rst @@ -191,7 +191,7 @@ You can add :code:`-D` argument to pass such options, like: "WITH_TESTING", "Build unit tests", "OFF" "WITH_DOC", "Build documentations", "OFF" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" - "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF" "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md index afc65e831d58ff427663806e56294292ccbef85b..f022e67fd3a048cd7e53c91d9a1fd0506487b665 100644 --- a/doc/design/concurrent_programming.md +++ b/doc/design/concurrent_programming.md @@ -12,7 +12,7 @@ The following table compares concepts in Fluid and Go | Go | Fluid | |----|-------| -|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) | +|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) | | control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) | | goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) | | runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) | diff --git a/doc/design/fluid.md b/doc/design/fluid.md index 2acc168007d25a083f588b48f84e12e29baf4f47..f78fa8c1914124f33b9730f918c8887ced4f8d9d 100644 --- a/doc/design/fluid.md +++ b/doc/design/fluid.md @@ -89,7 +89,7 @@ with train_loop.block(): h[t] = the_step(input[t]) ``` -An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44). +An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58). From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. 
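To make the loop construct referenced above concrete, here is a minimal sketch of the Fluid `While` pattern that `test_while_op.py` exercises. It assumes the `paddle.fluid` layers API of this era (`While`, `less_than`, `increment`, `fill_constant`); treat it as an illustration rather than the exact test code.

```python
import paddle.fluid as fluid

# Build a program that loops while i < 10; the loop body would hold the
# per-step computation (e.g. h[t] = the_step(input[t])).
i = fluid.layers.zeros(shape=[1], dtype='int64')
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = fluid.layers.less_than(x=i, y=limit)

while_op = fluid.layers.While(cond=cond)
with while_op.block():
    # ... one time step of the computation goes here ...
    i = fluid.layers.increment(x=i, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # refresh the condition
```

As the surrounding design doc notes, the whole loop is recorded into the `ProgramDesc` once and executed by the C++ `Executor`, rather than being re-interpreted by Python on every iteration, which is where the speedup over a plain Python loop comes from.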
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md index 1f68cef4cc28cd005acbeaa5c03cc0d84a83939c..285464ada728d8f7a086a26beca6cfa4418e98e4 100644 --- a/doc/design/memory_optimization.md +++ b/doc/design/memory_optimization.md @@ -101,7 +101,7 @@ In-place is a built-in attribute of an operator. Since we treat in-place and oth #### construct control flow graph -Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example. +Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example. - Block0: diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst index ed8a0c7e87da133138ecfc7ba6a8217d58b8f71d..31d2252eb5f5f6a87b1c93f36008fc4468795896 100644 --- a/doc/faq/build_and_install/index_cn.rst +++ b/doc/faq/build_and_install/index_cn.rst @@ -16,7 +16,7 @@ $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu -更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 +更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 9929767cac212237b3e2c3a547ba9a3c9d5f0979..4537c7a481e2efbcfed5fa7be2c81c36e13cd108 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -1,6 +1,8 @@ FAQ ==== +本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。 + .. toctree:: :maxdepth: 1 diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index 0306b1e5dd25a55545e464ce847291c33576575f..c6d3c5bfac5a276e253c248ffd415c7789b20b29 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -148,10 +148,10 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 .. code-block:: python -optimizer = paddle.optimizer.RMSProp( - learning_rate=1e-3, - gradient_clipping_threshold=10.0, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) 具体可以参考 `nmt_without_attention `_ 示例。 @@ -159,13 +159,13 @@ optimizer = paddle.optimizer.RMSProp( ..
code-block:: python -decoder_inputs = paddle.layer.fc( - act=paddle.activation.Linear(), - size=decoder_size * 3, - bias_attr=False, - input=[context, current_word], - layer_attr=paddle.attr.ExtraLayerAttribute( - error_clipping_threshold=100.0)) + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) 完整代码可以参考示例 `machine translation `_ 。 diff --git a/doc/faq/parameter/index_cn.rst b/doc/faq/parameter/index_cn.rst index 6fa0c64413be1616a435640b0347904a49873349..1fa4b3e1311d2007ccba98fde9ff94300ea42c16 100644 --- a/doc/faq/parameter/index_cn.rst +++ b/doc/faq/parameter/index_cn.rst @@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数 obj="process", args={"src_dict_path": src_dict_path}) -完整源码可参考 `seqToseq `_ 示例。 +完整源码可参考 `sequence_recurrent `_ 示例。 diff --git a/doc/howto/capi/index_cn.rst b/doc/howto/capi/index_cn.rst index e589a6d346a1e23a4eed9801e02727c80782ae8b..7f100717983f5e950b801e6b05ee48bfff273c62 100644 --- a/doc/howto/capi/index_cn.rst +++ b/doc/howto/capi/index_cn.rst @@ -1,6 +1,23 @@ C-API预测库 ================== +当我们训练完一个神经网络模型之后,下一步就是用模型来做预测。预测就是准备输入数据,经过模型处理之后,得到预测结果的过程。 + +相比于模型训练,预测有如下特点: + +#. 预测不需要训练过程中反向传播和参数更新的部分。 +#. 预测不需要标签(label)。 +#. 预测很多时候需要和用户系统整合在一起。 + +因为上述特点,模型预测SDK需要单独设计,并具备以下特点: + +#. 预测SDK不包含反向传播和参数更新部分,以减小SDK的体积。 +#. 预测SDK需要提供一个简洁的用户接口,方便使用。 +#. 因为输入数据可能有多种结构,对输入数据的格式做清晰简洁的封装。 +#. 为了和用户系统兼容,SDK的接口需要是满足C标准的接口。 + +PaddlePaddle提供了C-API,用于解决上述问题。关于C-API的使用,我们提供了如下指南: + .. toctree:: :maxdepth: 1 diff --git a/doc/howto/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md index a61d2267bfdb7c32da528735b20d7c6a531aaa1f..1ccc72eefbc730b2eab2d51f5b04e50728b735d7 100644 --- a/doc/howto/capi/workflow_of_capi_cn.md +++ b/doc/howto/capi/workflow_of_capi_cn.md @@ -65,6 +65,7 @@ output_file = "output.paddle.model" merge_v2_model(net, param_file, output_file) ``` + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 #### 注意事项 diff --git a/doc/howto/cluster/fluid_cluster_train_en.md b/doc/howto/cluster/fluid_cluster_train_en.md index ae825d9a517c7e9005d4e32f8f34b3f6a79be0c9..b4465e8269c2e1603c02404ea33f8c4572e76442 100644 --- a/doc/howto/cluster/fluid_cluster_train_en.md +++ b/doc/howto/cluster/fluid_cluster_train_en.md @@ -32,7 +32,7 @@ The non-cluster version of this demo with fluid API is as follows: ``` python import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -125,11 +125,11 @@ for pass_id in range(100): ### E2E demo -Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). +Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py). First `cd` into the folder that contains the `python` files. 
In this case: ```bash -cd /paddle/python/paddle/v2/fluid/tests/book_distribute +cd /paddle/python/paddle/fluid/tests/book_distribute ``` In parameter server node run the following in the command line: diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst index ef56b6ddb38e59f20f7248de1ceb952c7627ce76..eabf95eda0b20f91913201a6b4e5b56fa440597e 100644 --- a/doc/howto/cluster/multi_cluster/index_cn.rst +++ b/doc/howto/cluster/multi_cluster/index_cn.rst @@ -1,20 +1,35 @@ 在不同集群中运行 ================ +用户的集群环境不尽相同,为了方便大家的部署,我们提供了多种的集群部署方式,方便提交集群训练任务,以下将一一介绍: -PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: -- `Kubernetes `_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 -- `OpenMPI `_ 成熟的高性能并行计算框架。 -- `Fabric `_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 +`Kubernetes `_ 是Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持: -对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 `_ 找到。 +.. toctree:: + :maxdepth: 1 + + k8s_cn.md + k8s_distributed_cn.md -在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 +`OpenMPI `_ 是成熟的高性能并行计算框架,在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务: .. toctree:: :maxdepth: 1 - fabric_cn.md openmpi_cn.md - k8s_cn.md - k8s_distributed_cn.md + +`Fabric `_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法,如果想详细了解,请阅读以下指南: + +.. toctree:: + :maxdepth: 1 + + fabric_cn.md + +我们也支持在AWS上部署PaddlePaddle,详细请了解: + +.. toctree:: + :maxdepth: 1 + k8s_aws_cn.md + +您可以在 `cluster_train_v2 `_ 找到以上相关的例子。 + diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md index 14eba0e2f34b115f5cd24920b5b1af07ec953d00..d59be670c2b33b64d9b6f96b53f50e5bf9f0613b 100644 --- a/doc/howto/optimization/cpu_profiling_cn.md +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -35,7 +35,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` ncalls tottime percall cumtime percall filename:lineno(function) 1 0.284 0.284 29.514 29.514 main.py:1() - 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) 4696 12.040 0.003 12.040 0.003 {built-in method run} 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() ``` @@ -61,9 +61,9 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ```text 4696 12.040 0.003 12.040 0.003 {built-in method run} 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) - 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) - 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) - 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() ``` 可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 @@ -76,9 +76,9 
@@ Called By: Function was called by... ncalls tottime cumtime -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) - 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) Called: diff --git a/doc/howto/optimization/cpu_profiling_en.md b/doc/howto/optimization/cpu_profiling_en.md index 368af40cc7308cf6f4c609361078fe3ba02213ed..01e5fddf61547f9fc86ef18a6f2e2ac508d22dbb 100644 --- a/doc/howto/optimization/cpu_profiling_en.md +++ b/doc/howto/optimization/cpu_profiling_en.md @@ -49,7 +49,7 @@ port, we will see the output like the following: ``` ncalls tottime percall cumtime percall filename:lineno(function) 1 0.284 0.284 29.514 29.514 main.py:1() - 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) 4696 12.040 0.003 12.040 0.003 {built-in method run} 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() ``` @@ -74,9 +74,9 @@ focus on. We can sort above profiling file by tottime: ```text 4696 12.040 0.003 12.040 0.003 {built-in method run} 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) - 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) - 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) - 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() ``` We can see that the most time-consuming function is the `built-in @@ -93,9 +93,9 @@ Called By: Function was called by... 
ncalls tottime cumtime -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) - 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) Called: diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md index 31987920f32f217ac2db42548874cfe7da57dd72..edf46aff8c6cc9fc01d26c6453b3a8123238ef91 100644 --- a/doc/howto/read_source.md +++ b/doc/howto/read_source.md @@ -1,6 +1,6 @@ # PaddlePaddle Fluid Source Code Overview -Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book +Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework @@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) ``` -- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#) -- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers) +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) - Every Layer has one or more operators and variables/parameters - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) - Optimizer: `fluid.optimizer.SGD`. It does the following - - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)] - - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)] + - Add backward operators. 
[[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] + - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] # Run Time @@ -57,7 +57,7 @@ exe.run(fluid.default_main_program(), - Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) -- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] - Feeds the data: `feed=feeder.feed(data)` - Evaluates all the operators - Fetches the result: `fetch_list=[avg_cost]` diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index ae24ced770492743065e37654b494caf6b4c5bc0..cdd6917239371a660d0df05bb623f0b94f8f11a3 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -23,6 +23,12 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android $ docker pull paddlepaddle/paddle:latest-dev-android ``` +对于国内用户,我们提供了加速访问的镜像源: + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + ### 编译PaddlePaddle C-API库 构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 Android的Docker开发镜像向用户提供两个可配置的参数: @@ -56,15 +62,15 @@ Android的Docker开发镜像向用户提供两个可配置的参数: - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 - ```bash - $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev - ``` +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev +``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 - ```bash - $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev - ``` +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev +``` 执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 @@ -155,7 +161,11 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. 
``` -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 +用户还可根据自己的需求设置其他编译参数。 + +- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`,最小化生成的库的大小。 +- 设置`CMAKE_BUILD_TYPE`为`Release`,获得最快的执行速度, +- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 **性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 0cf50181df4116beda3aa6faf836eda92edf6066..6af16fc114a2310e364023ec43cc3c64149af8f7 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -25,6 +25,12 @@ Users can directly use the published Docker image. $ docker pull paddlepaddle/paddle:latest-dev-android ``` +For users in China, we provide a faster mirror. + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + ### Build the Inference Library We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: @@ -86,19 +92,19 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht - To build the standalone toolchain for `armeabi-v7a` and Android API level 21: - ```bash - your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain - ``` +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. - To build the standalone toolchain for `arm64-v8a` and Android API level 21: - ```bash - your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain - ``` +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ef1bc07c2dbe71268c706a119056d3a9fcfc7f8c..0b4c6db6f98d8d73b362d3c98f52a3914a031c68 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,7 +56,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) + shape_inference data_transform lod_tensor profiler) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) @@ -68,9 +68,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto - COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/ - COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto." + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) cc_library(backward SRCS backward.cc DEPS net_op) @@ -80,7 +80,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope -framework_proto backward glog lod_rank_table profiler feed_fetch_method) +framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0d2691e8115ad6de46dcd4fcd5b7fd79ed60ecb9..961e3e22f278d6e0346defd90190c53fd31ede08 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, @@ -58,13 +57,13 @@ static void CreateTensor(Variable* var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::CHANNEL) { var->GetMutable(); - } else if (var_type == proto::VarType::NCCL_COM) { - // GetMutable will be called in ncclInit + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator } else { PADDLE_THROW( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, NCCL_COM]", + "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", var_type); } } @@ -126,9 +125,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(op->Type(), pool.Get(place_)); - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 5b43f5a8a4a1c128b04ac206d387e30c55f533fe..38f22b89143c3e23c8368b9281ccc757a892a373 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -113,7 +113,10 @@ message VarType { PLACE_LIST = 14; READER = 15; CHANNEL = 16; - NCCL_COM = 17; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; } required Type type = 1; @@ -164,4 +167,6 @@ message BlockDesc { // Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md // for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? message ProgramDesc { repeated BlockDesc blocks = 1; } diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 4cf14c8da547d79258e99d0c64e83f9218a92910..e2f4e9cad1996578b7c51257785e1273d126f80f 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -31,8 +31,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { os << "{"; for (auto &v : lod) { os << "{"; + bool is_first = true; for (auto &i : v) { - os << i << ","; + if (is_first) { + os << i; + is_first = false; + } else { + os << ", " << i; + } } os << "}"; } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index b72aad6fb538ac483e9ce6fc9cb866c75190f006..614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -125,6 +125,8 @@ class OpDesc { BlockDesc *Block() { return this->block_; } + const BlockDesc &BlockRef() const { return *this->block_; } + void SetBlock(BlockDesc *block) { this->block_ = block; } private: diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7debdd8525741ea4ae93fe7bff7b5817373fd7ce..ac6289c5abe8f40ae9ee32aa3d58cdef3ff0e836 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); @@ -497,7 +498,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto dev_ctx = pool.Get(place); - + // profile + platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 71c5ab3db937f70ff84391e98d28f023f6dddcfb..80eb9889670744ae527ea29609b33631a021bfa8 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -32,23 +32,11 @@ void ReadBinaryFile(const std::string& filename, std::string& contents) { inputfs.close(); } -bool IsParameter(const framework::VarDesc* var, - const framework::ProgramDesc& main_program) { - if (var->Persistable()) { - // There are many unreachable variables in the program - for (size_t i = 0; i < main_program.Size(); ++i) { - const framework::BlockDesc& block = main_program.Block(i); - for (auto* op : block.AllOps()) { - if (op->Type() == framework::kFeedOpType) { - continue; - } - for (auto input_argument_name : op->InputArgumentNames()) { - if (input_argument_name == var->Name()) { - return true; - } - } - } - } +bool IsPersistable(const framework::VarDesc* var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST) { + return true; } return false; } @@ -65,8 +53,8 @@ void LoadPersistables(framework::Executor& executor, std::vector paramlist; for (auto* var : global_block.AllVars()) { - if (IsParameter(var, main_program)) { - VLOG(3) << "parameter's name: " << var->Name(); + if (IsPersistable(var)) { + VLOG(3) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); @@ -101,7 +89,6 @@ void LoadPersistables(framework::Executor& executor, executor.Run(*load_program, &scope, 0, true, true); - VLOG(3) << "Ran loading successfully"; delete load_program; } diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index c0aba39b97770886cf95456e9876d3cc7ff92266..e7ffb00ec8d8926193fe510ebdb7185f75c90906 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -4,7 +4,7 @@ function(inference_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests) + set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests) set(arg_list "") if(inference_test_ARGS) foreach(arg ${inference_test_ARGS}) @@ -30,5 +30,5 @@ inference_test(label_semantic_roles) inference_test(recognize_digits ARGS mlp conv) inference_test(recommender_system) #inference_test(rnn_encoder_decoder) -inference_test(understand_sentiment) +inference_test(understand_sentiment ARGS conv) inference_test(word2vec) diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc index 
443193aae8b38323883d460bc37a9c14430fc8bb..184924016634bba26204d937744ca5fa87cd443c 100644 --- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc +++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc @@ -32,16 +32,42 @@ TEST(inference, label_semantic_roles) { paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark; paddle::framework::LoD lod{{0, 4, 10}}; - - SetupLoDTensor(word, lod, static_cast(0), static_cast(1)); - SetupLoDTensor( - predicate, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(ctx_n2, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(ctx_n1, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(ctx_0, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(ctx_p1, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(ctx_p2, lod, static_cast(0), static_cast(1)); - SetupLoDTensor(mark, lod, static_cast(0), static_cast(1)); + int64_t word_dict_len = 44068; + int64_t predicate_dict_len = 3162; + int64_t mark_dict_len = 2; + + SetupLoDTensor(word, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(predicate, + lod, + static_cast(0), + static_cast(predicate_dict_len - 1)); + SetupLoDTensor(ctx_n2, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(ctx_n1, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(ctx_0, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(ctx_p1, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(ctx_p2, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(mark, + lod, + static_cast(0), + static_cast(mark_dict_len - 1)); std::vector cpu_feeds; cpu_feeds.push_back(&word); diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc index e67064fb61d18ff8db540a68e94729649e44cd1a..824b3274ebc7ba046e61798b3f61ef9924a75679 100644 --- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc +++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc @@ -31,7 +31,12 @@ TEST(inference, understand_sentiment) { paddle::framework::LoDTensor words; paddle::framework::LoD lod{{0, 4, 10}}; - SetupLoDTensor(words, lod, static_cast(0), static_cast(10)); + int64_t word_dict_len = 5147; + + SetupLoDTensor(words, + lod, + static_cast(0), + static_cast(word_dict_len - 1)); std::vector cpu_feeds; cpu_feeds.push_back(&words); diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc index e2f2f36a8222e03f77eca65d6331b4a52c0eea82..1481760c529c29a7290f476e2a22e1ded5ab7787 100644 --- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc +++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc @@ -31,12 +31,12 @@ TEST(inference, word2vec) { paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; paddle::framework::LoD lod{{0, 1}}; - int64_t dict_size = 2072; // Hard-coding the size of dictionary + int64_t dict_size = 2073; // The size of dictionary - SetupLoDTensor(first_word, lod, static_cast(0), dict_size); - SetupLoDTensor(second_word, lod, static_cast(0), dict_size); - SetupLoDTensor(third_word, lod, static_cast(0), dict_size); - SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size); + SetupLoDTensor(first_word, lod, static_cast(0), dict_size - 1); + 
SetupLoDTensor(second_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(third_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size - 1); std::vector cpu_feeds; cpu_feeds.push_back(&first_word); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index abe2032cc058e50a63ac72cccd90e060c6e14479..49518e50d8541477234f17ac5b8709aeb57662ff 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -101,8 +101,8 @@ void TestInference(const std::string& dirname, if (IsCombined) { // All parameters are saved in a single file. // Hard-coding the file names of program and parameters in unittest. - // Users are free to specify different filename - // (provided: the filenames are changed in the python api as well: io.py) + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. std::string prog_filename = "__model_combined__"; std::string param_filename = "__params_combined__"; inference_program = paddle::inference::Load(executor, diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8f14fd376ae51eff0f56c5a8d679c49cec23bd68..4da46e94c5cd979507fed80b35ebedf0cc6791d0 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -11,6 +11,8 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(cu_cc_srcs) + set(cudnn_cu_cc_srcs) + set(CUDNN_FILE) set(op_common_deps operator op_registry math_function) set(options "") set(oneValueArgs "") @@ -30,10 +32,16 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) + list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) + endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") + list(APPEND cudnn_cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") @@ -54,7 +62,7 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} @@ -98,6 +106,12 @@ function(op_library TARGET) set(pybind_flag 1) endif() + # pybind USE_OP_DEVICE_KERNEL for CUDNN + list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) + if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + # pybind USE_OP if (${pybind_flag} EQUAL 0) file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") @@ -141,6 +155,7 @@ op_library(print_op DEPS lod_tensor) op_library(adagrad_op DEPS selected_rows_functor) op_library(maxout_op DEPS maxouting) op_library(unpool_op DEPS unpooling) +op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op DEPS lod_rank_table) op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) @@ -152,43 +167,17 @@ op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstmp_op DEPS sequence2batch lstm_compute) 
op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) -op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(create_reader_op DEPS reader) -# Regist multiple Kernel to pybind if (WITH_GPU) - -op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS - vol2col depthwise_conv) - -op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) -op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) -op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc - conv_transpose_cudnn_op.cu.cc DEPS vol2col) -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") + op_library(conv_op DEPS vol2col depthwise_conv) else() -op_library(conv_op SRCS conv_op.cc DEPS vol2col) -op_library(pool_op SRCS pool_op.cc DEPS pooling) -op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) + op_library(conv_op DEPS vol2col) endif() - -cc_library(batch_size_like SRCS batch_size_like.cc DEPS op_registry) - -op_library(fill_constant_batch_size_like_op - SRCS fill_constant_batch_size_like_op.cc fill_constant_batch_size_like_op.cu.cc - DEPS batch_size_like) - -op_library(uniform_random_batch_size_like_op - SRCS uniform_random_batch_size_like_op.cc - DEPS batch_size_like uniform_random_op) - -op_library(gaussian_random_batch_size_like_op - SRCS gaussian_random_batch_size_like_op.cc - DEPS batch_size_like gaussian_random_op) +op_library(conv_transpose_op DEPS vol2col) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/batch_size_like.cc b/paddle/fluid/operators/batch_size_like.cc deleted file mode 100644 index 4d4a6d4c472fe2dedb0cd37bff7bbf5bdad3ead7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/batch_size_like.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/batch_size_like.h" - -namespace paddle { -namespace operators { - -void BatchSizeLikeOp::InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of %s should not be null.", Type()); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of %s should not be null.", - Type()); - - auto &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE_GT(shape.size(), 0); - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto output_dim = framework::make_ddim(shape_int64); - - int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); - PADDLE_ENFORCE_GE(input_dim_idx, 0); - PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); - - int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); - PADDLE_ENFORCE_GE(output_dim_idx, 0); - PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); - - output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; - ctx->SetOutputDim("Out", output_dim); -} - -BatchSizeLikeOpMaker::BatchSizeLikeOpMaker(OpProto *proto, - OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); - AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " - "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); - AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") - .SetDefault(0); - AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") - .SetDefault(0); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 87e8f053a73a23cd9e231ada6501d0d9344bb1a6..0bdf27e620a3a7c7b62b955f708a5e2aad1a6986 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -24,12 +24,50 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of %s should not be null.", Type()); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of %s should not be null.", Type()); + + auto &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_GT(shape.size(), 0); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto output_dim = framework::make_ddim(shape_int64); + + int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); + PADDLE_ENFORCE_GE(input_dim_idx, 0); + PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + + int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); + PADDLE_ENFORCE_GE(output_dim_idx, 0); + PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + + output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; + ctx->SetOutputDim("Out", output_dim); + } }; class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: - BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker); + BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : 
framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Tensor " + "whose input_dim_idx'th dimension specifies the batch_size"); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("input_dim_idx", + "(int, default 0) The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "(int, default 0) The index of output's batch size dimension") + .SetDefault(0); + } }; } // namespace operators diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc index c536cf6b6b822c8d9553d7d2cf57902e5e6e5343..2b3f26c0a890c33f9b4f4c8a5a271123d7ff0b31 100644 --- a/paddle/fluid/operators/bipartite_match_op.cc +++ b/paddle/fluid/operators/bipartite_match_op.cc @@ -94,6 +94,38 @@ class BipartiteMatchKernel : public framework::OpKernel { } } + void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist, + T overlap_threshold) const { + constexpr T kEPS = static_cast(1e-6); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + // the j-th column has been matched to one entity. + continue; + } + int max_row_idx = -1; + T max_dist = -1; + for (int i = 0; i < row; ++i) { + T dist = dist_data[i * col + j]; + if (dist < kEPS) { + // distance is 0 between the i-th row and the j-th column + continue; + } + if (dist >= overlap_threshold && dist > max_dist) { + max_row_idx = i; + max_dist = dist; + } + } + if (max_row_idx != -1) { + PADDLE_ENFORCE_EQ(match_indices[j], -1); + match_indices[j] = max_row_idx; + match_dist[j] = max_dist; + } + } + } + void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); @@ -120,13 +152,21 @@ class BipartiteMatchKernel : public framework::OpKernel { int* indices = match_indices->data(); T* dist = match_dist->data(); + auto type = context.Attr("match_type"); + auto threshold = context.Attr("dist_threshold"); if (n == 1) { BipartiteMatch(*dist_mat, indices, dist); + if (type == "per_prediction") { + ArgMaxMatch(*dist_mat, indices, dist, threshold); + } } else { auto lod = dist_mat->lod().back(); for (size_t i = 0; i < lod.size() - 1; ++i) { Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); BipartiteMatch(one_ins, indices + i * col, dist + i * col); + if (type == "per_prediction") { + ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); + } } } } @@ -147,6 +187,19 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "This tensor can contain LoD information to represent a batch of " "inputs. One instance of this batch can contain different numbers of " "entities."); + AddAttr( + "match_type", + "(string, default: bipartite) " + "The type of matching method, should be 'bipartite' or " + "'per_prediction'; 'bipartite' by default.") + .SetDefault("bipartite") + .InEnum({"bipartite", "per_prediction"}); + AddAttr( + "dist_threshold", + "(float, default: 0.5) " + "If `match_type` is 'per_prediction', this threshold is used to " + "determine the extra matching bboxes based on the maximum distance.") + .SetDefault(0.5); AddOutput("ColToRowMatchIndices", "(Tensor) A 2-D Tensor with shape [N, M] in int type. " "N is the batch size.
              "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
@@ -168,10 +221,10 @@ distance matrix.
 
 For input 2D matrix, the bipartite matching algorithm can find
 the matched column for each row, also can find the matched row for each
 column. And this operator only calculates matched indices from column to row.
 For each instance, the number of matched indices is the number
-of columns of the input ditance matrix.
+of columns of the input distance matrix.
 
 There are two outputs to save matched indices and distance.
-A simple description, this algothrim matched the best (maximum distance)
+Put simply, this algorithm matches the best (maximum-distance)
 row entity to the column entity and the matched indices are not duplicated
 in each row of ColToRowMatchIndices. If the column entity is not matched to
 any row entity, set -1 in ColToRowMatchIndices.
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index cdeb28cc1db2c3328d8f605e6584f5b5e1311e97..86f7046058c7001fcaa588727b1cdc0f3f20c35f 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -83,7 +83,7 @@ class CompareOp : public framework::OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
-#define REGISTER_LOGICAL_OP(op_type, _equation) \
+#define REGISTER_COMPARE_OP(op_type, _equation) \
   struct _##op_type##Comment {                  \
     static char type[];                         \
    static char equation[];                      \
@@ -96,11 +96,17 @@ class CompareOp : public framework::OperatorWithKernel {
       ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
       ::paddle::framework::EmptyGradOpMaker);
 
-REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
-REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
-REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
-REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
-REGISTER_LOGICAL_OP(equal, "Out = X == Y");
-REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
-REGISTER_LOGICAL_OP(not_equal, "Out = X != Y");
-REGISTER_LOGICAL_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
+REGISTER_COMPARE_OP(less_than, "Out = X < Y");
+REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_COMPARE_OP(less_equal, "Out = X <= Y");
+REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_COMPARE_OP(greater_than, "Out = X > Y");
+REGISTER_COMPARE_KERNEL(greater_than, CPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y");
+REGISTER_COMPARE_KERNEL(greater_equal, CPU,
+                        paddle::operators::GreaterEqualFunctor);
+REGISTER_COMPARE_OP(equal, "Out = X == Y");
+REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
+REGISTER_COMPARE_OP(not_equal, "Out = X != Y");
+REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu
index 2cc0c7c57257ad5bd89606a5e29f17156cbda773..1bf85c64fb5b4d79c62118959fd72b13ed1c63ed 100644
--- a/paddle/fluid/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
@@ -14,7 +14,11 @@ limitations under the License.
*/ #include "paddle/fluid/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_LOGICAL_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_equal, CUDA, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/compare_op.h b/paddle/fluid/operators/compare_op.h index 7e78269cf4767eb78cde1decaedc96a3c921716a..1cbabdaf6767815c1fedba0eabec9b5de678e047 100644 --- a/paddle/fluid/operators/compare_op.h +++ b/paddle/fluid/operators/compare_op.h @@ -34,6 +34,18 @@ struct LessEqualFunctor { HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } }; +template +struct GreaterThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; } +}; + +template +struct GreaterEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; } +}; + template struct EqualFunctor { using ELEM_TYPE = T; @@ -76,7 +88,7 @@ class CompareOpKernel } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::CompareOpKernel< \ ::paddle::platform::dev##DeviceContext, functor>, \ diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index eb0e43ad2d84f681f39ed4adc5a27f6d3ab00f08..208a4481c6afe1b8f62e8f675c951c3349639f46 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" @@ -34,12 +35,46 @@ class ConcatKernel : public framework::OpKernel { auto out_stride = framework::stride_numel(out->dims()); size_t output_offset = 0; - for (auto* in : ins) { - auto in_stride = framework::stride_numel(in->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, - out->data() + output_offset, out_stride, - in->data(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; + + // If axis >=1, copy to out immediately need to call many times + // of cuda memcpy. Copy the input to cpu and do the stride copy, + // then copy to gpu output. 
+
+    if (platform::is_gpu_place(place) && axis >= 1) {
+      platform::CPUPlace copy_place;
+      auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
+      framework::Tensor cpu_out;
+      cpu_out.Resize(out->dims());
+      cpu_out.mutable_data<T>(copy_place);
+      auto& dev_ctx = ctx.device_context();
+      std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
+      for (auto* in : ins) {
+        std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
+        framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
+        cpu_ins.emplace_back(std::move(cpu_in));
+      }
+      // TODO(dzhwinter): overlap copy and compute stream
+      // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
+      dev_ctx.Wait();
+
+      for (auto& in : cpu_ins) {
+        auto& cpu_in = *in.get();
+        auto in_stride = framework::stride_numel(cpu_in.dims());
+
+        StridedNumelCopyWithAxis<T>(
+            cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
+            cpu_in.data<T>(), in_stride, in_stride[axis]);
+        output_offset += in_stride[axis];
+      }
+      framework::TensorCopy(cpu_out, place, dev_ctx, out);
+    } else {
+      for (auto* in : ins) {
+        auto in_stride = framework::stride_numel(in->dims());
+        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
+                                    out->data<T>() + output_offset, out_stride,
+                                    in->data<T>(), in_stride, in_stride[axis]);
+        output_offset += in_stride[axis];
+      }
+    }
   }
 };
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 2ecece707314f1e8b1b0bc9ad28f53ec5e1d405e..83b7708bf337b70f97c5e9126efd142b9b957b00 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -54,12 +54,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
-                           (dilations[i] * (filter_dims[i + 2] - 1) + 1) >
-                       0,
-                   "Due to the settings of paddings, filter_dims and "
-                   "dilations, the output size is less than 0, please check "
-                   "again.");
     output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                           dilations[i], paddings[i],
                                           strides[i]));
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index c93c2e73f720ae025a4ad4f8146a7c6c3c382eea..12b45f1d65019f623268cb9da9004bac5e1f72a3 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -31,7 +31,14 @@ using Tensor = framework::Tensor;
 inline int ConvOutputSize(int input_size, int filter_size, int dilation,
                           int padding, int stride) {
   const int dkernel = dilation * (filter_size - 1) + 1;
-  const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  PADDLE_ENFORCE(
+      output_size > 0,
+      "Due to the settings of padding(%d), filter_size(%d), dilation(%d) and "
+      "stride(%d), the output size is not greater than 0, please check "
+      "again. Input_size:%d",
Input_size:%d", + padding, filter_size, dilation, stride, input_size); + return output_size; } inline bool IsExpand(std::vector& filter_dim, diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ee9044b1f5d46dc725c9583d0d90ab5681d2850c..7266f3276477891d3c7b6827316a428ef7a31c6e 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -177,8 +177,8 @@ std::shared_ptr RPCClient::GetChannel(const std::string& ep) { args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - auto ch = std::shared_ptr( - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + auto ch = + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); channels_[ep] = ch; return ch; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index ee0e3533ce028992af3d4558e3fd198a09c4816b..8e9923c87ce22ed229f78ef15430e50cab16c947 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -129,6 +129,8 @@ class ListenAndServOp : public framework::OperatorBase { } if (exit_flag) { rpc_service_->ShutDown(); + rpc_service_->SetCond(1); + break; } try { executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc index 0994bba782b42be994ae479f4c9c4de5a2e384ed..9185666c56c4621d42429c9cfdb079001c6336f1 100644 --- a/paddle/fluid/operators/nccl_op.cc +++ b/paddle/fluid/operators/nccl_op.cc @@ -65,7 +65,7 @@ class NCCLInitOpVarTypeInference : public framework::VarTypeInference { framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Communicator").front(); auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::NCCL_COM; + auto var_type = framework::proto::VarType::RAW; out_var.SetType(var_type); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index c7729ad1322588c0558a136dbd5d48f757d38412..a87a3511ee46dd657c27da26feb43ba43a08f25d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -19,6 +19,11 @@ namespace operators { int PoolOutputSize(int input_size, int filter_size, int padding, int stride) { int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + PADDLE_ENFORCE(output_size > 0, + "Due to the settings of padding(%d), filter_size(%d) and " + "stride(%d), the output size is less than 0, please check " + "again. 
Input_size:%d", + padding, filter_size, stride, input_size); return output_size; } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a90ffb4ff3e6e1bfcc0d00bc4714b3067fdede6c..3580932356fd5f29d5e4d00a70e64c207c64e41e 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -121,10 +121,15 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL(reshape, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL( - reshape_grad, ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel, + ops::ReshapeKernel, + ops::ReshapeKernel, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel, + ops::ReshapeGradKernel, + ops::ReshapeGradKernel, + ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu index d5ceaf784c0e4b1c8d527958be31d5186c2823d3..c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c 100644 --- a/paddle/fluid/operators/reshape_op.cu +++ b/paddle/fluid/operators/reshape_op.cu @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/reshape_op.h" +using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL( - reshape, - paddle::operators::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL( - reshape_grad, - paddle::operators::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL(reshape_grad, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 58850bf566e00f88de19305110e2ef696b73467e..178976f96fdbd08cead7b7c518ea1fbaaa2a5db8 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -121,9 +121,27 @@ This operator will send tensor to recv_op at the parameter server. 
 }
 };
 
+class SendOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto out_var_name = op_desc.Output("RPCClient").front();
+    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class SendOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
+REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
+                  ops::SendOpMaker, ops::SendOpVarTypeInference,
+                  ops::SendOpShapeInference);
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index 008c012a32e0c88dfb0c05d7e485ffc367b3cac5..e9fb845b475ff5776bf948ab120a44c16ed87aa0 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -95,7 +95,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
-      var->SetDataType(f::proto::DataType::FP32);
+      var->SetDataType(f::proto::VarType::FP32);
     }
   }
 
@@ -122,33 +122,37 @@ void StartServerNet(bool is_sparse) {
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
+  f::BlockDesc *optimize_block = program.MutableBlock(0);
   // X for server side tensors, RX for received tensors, must be of same shape.
-  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
 
   f::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"Fanin", 1});
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"OptimizeBlock", block});
+  attrs.insert({"OptimizeBlock", optimize_block});
   listen_and_serv_op =
-      f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs);
+      f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   listen_and_serv_op->Run(scope, place);
 }
 
 TEST(SendRecvOp, CPUDense) {
   std::thread server_thread(StartServerNet, false);
-  sleep(10);  // wait server to start
+  sleep(5);  // wait server to start
   // local net
   f::Scope scope;
   p::CPUPlace place;
   InitTensorsInScope(scope, place);
+  // create rpc client var
+  scope.Var("RPC_CLIENT_VAR");
 
   f::AttributeMap attrs;
   attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
   attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
-  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
-                                         {{"Out", {"Out"}}}, attrs);
+  auto send_op = f::OpRegistry::CreateOp(
+      "send", {{"X", {"x1"}}},
+      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
   send_op->Run(scope, place);
 
   auto in_var = scope.Var("x1");
@@ -175,11 +179,13 @@ TEST(SendRecvOp, CPUSparse) {
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
   InitSelectedRowsInScope(scope, place);
+  scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
   attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
   attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
-  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
-                                         {{"Out", {"Out"}}}, attrs);
+  auto send_op =
f::OpRegistry::CreateOp( + "send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); send_op->Run(scope, place); auto x0 = scope.Var("x0")->GetMutable(); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0cee21d14f29c03ebabcb921ecc4f29f352b55..28a668c86aa322803a65b916b4273181f5652e21 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,3 +1,5 @@ +proto_library(profiler_proto SRCS profiler.proto) + if(WITH_GPU) cc_library(enforce SRCS enforce.cc DEPS) else() @@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) -cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) +cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bbdfa5fd5d9781d5f2b310d2142b1b4decbf9b --- /dev/null +++ b/paddle/fluid/platform/device_tracer.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device_tracer.h" +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace platform { +namespace { + +thread_local const char *cur_annotation = nullptr; +std::once_flag tracer_once_flag; +DeviceTracer *tracer = nullptr; +} // namespace +#ifdef PADDLE_WITH_CUPTI + +namespace { +// TODO(panyx0718): Revisit the buffer size here. +uint64_t kBufSize = 32 * 1024; +uint64_t kAlignSize = 8; + +#define ALIGN_BUFFER(buffer, align) \ + (((uintptr_t)(buffer) & ((align)-1)) \ + ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \ + : (buffer)) + +#define CUPTI_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char *errstr; \ + dynload::cuptiGetResultString(_status, &errstr); \ + fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ + __FILE__, __LINE__, #call, errstr); \ + exit(-1); \ + } \ + } while (0) + +void EnableActivity() { + // Device activity record is created when CUDA initializes, so we + // want to enable it before cuInit() or any CUDA runtime call. 
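The `CUPTI_CALL` wrapper defined above is the classic do-while(0) status-check idiom: evaluate the call once, and on failure print file, line, the call text, and a decoded error string before aborting. The self-contained sketch below shows the same pattern; `FakeStatus` and `FakeGetResultString` are invented stand-ins for illustration, not real CUPTI symbols.

```cpp
#include <cstdio>
#include <cstdlib>

// Stand-in status type and decoder, in place of CUptiResult and
// cuptiGetResultString.
enum FakeStatus { kSuccess = 0, kFailure = 1 };
static const char* FakeGetResultString(FakeStatus s) {
  return s == kSuccess ? "SUCCESS" : "FAILURE";
}

// The do { ... } while (0) wrapper makes the macro behave like a single
// statement, so it composes safely with if/else without braces.
#define CHECK_STATUS(call)                                          \
  do {                                                              \
    FakeStatus _status = (call);                                    \
    if (_status != kSuccess) {                                      \
      fprintf(stderr, "%s:%d: %s failed: %s\n", __FILE__, __LINE__, \
              #call, FakeGetResultString(_status));                 \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

int main() {
  CHECK_STATUS(kSuccess);  // passes silently; kFailure would abort with context
  return 0;
}
```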
+ CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + // We don't track these activities for now. + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); +} + +void DisableActivity() { + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); + // Disable all other activity record kinds. + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); +} + +void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, + size_t *maxNumRecords) { + uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize); + *size = kBufSize; + *buffer = ALIGN_BUFFER(buf, kAlignSize); + *maxNumRecords = 0; +} + +void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, + size_t size, size_t validSize) { + CUptiResult status; + CUpti_Activity *record = NULL; + if (validSize > 0) { + do { + status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record); + if (status == CUPTI_SUCCESS) { + switch (record->kind) { + case CUPTI_ACTIVITY_KIND_KERNEL: + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto *kernel = + reinterpret_cast(record); + tracer->AddKernelRecords(kernel->start, kernel->end, + kernel->deviceId, kernel->streamId, + kernel->correlationId); + break; + } + default: { break; } + } + } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { + // Seems not an error in this case. 
+        break;
+      } else {
+        CUPTI_CALL(status);
+      }
+    } while (1);
+
+    size_t dropped;
+    CUPTI_CALL(
+        dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
+    if (dropped != 0) {
+      fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
+    }
+  }
+  free(buffer);
+}
+}  // namespace
+
+class DeviceTracerImpl : public DeviceTracer {
+ public:
+  DeviceTracerImpl() : enabled_(false) {}
+
+  void AddAnnotation(uint64_t id, const std::string &anno) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    correlations_[id] = anno;
+  }
+
+  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
+                        uint32_t stream_id, uint32_t correlation_id) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    kernel_records_.push_back(
+        KernelRecord{start, end, device_id, stream_id, correlation_id});
+  }
+
+  bool IsEnabled() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    return enabled_;
+  }
+
+  void Enable() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    if (enabled_) {
+      fprintf(stderr, "DeviceTracer already enabled\n");
+      return;
+    }
+    EnableActivity();
+
+    // Register callbacks for buffer requests and buffer completions by CUPTI.
+    CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
+                                                       bufferCompleted));
+
+    CUptiResult ret;
+    ret = dynload::cuptiSubscribe(
+        &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
+    if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
+      fprintf(stderr, "CUPTI subscriber limit reached.\n");
+    } else if (ret != CUPTI_SUCCESS) {
+      fprintf(stderr, "Failed to create CUPTI subscriber.\n");
+    }
+    CUPTI_CALL(
+        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
+                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
+
+    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
+    enabled_ = true;
+  }
+
+  proto::Profile GenProfile() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    proto::Profile profile_pb;
+    profile_pb.set_start_ns(start_ns_);
+    profile_pb.set_end_ns(end_ns_);
+    std::map<std::string, std::vector<uint64_t>> event_times;
+    for (const KernelRecord &r : kernel_records_) {
+      if (correlations_.find(r.correlation_id) == correlations_.end()) {
+        fprintf(stderr, "cannot relate a kernel activity\n");
+        continue;
+      }
+      auto *event = profile_pb.add_events();
+      event->set_name(correlations_.at(r.correlation_id));
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event_times[event->name()].push_back(r.end_ns - r.start_ns);
+    }
+    for (const auto &et : event_times) {
+      fprintf(
+          stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
+          et.first.c_str(),
+          std::accumulate(et.second.begin(), et.second.end(), uint64_t{0}) /
+              1000000.0,
+          et.second.size());
+    }
+    return profile_pb;
+  }
+
+  void Disable() {
+    // flush might cause additional calls to DeviceTracer.
+    dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+    std::lock_guard<std::mutex> l(trace_mu_);
+    DisableActivity();
+    dynload::cuptiUnsubscribe(subscriber_);
+    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
+    PADDLE_ENFORCE(dynload::cuptiFinalize());
+    enabled_ = false;
+  }
+
+ private:
+  static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
+                                   CUpti_CallbackId cbid, const void *cbdata) {
+    auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
+    DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
+
+    if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
+        (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
+      if (cbInfo->callbackSite == CUPTI_API_ENTER) {
+        const std::string anno =
+            cur_annotation ? cur_annotation : cbInfo->symbolName;
+        tracer->AddAnnotation(cbInfo->correlationId, anno);
+      }
+    } else {
+      VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
+    }
+  }
+
+  std::mutex trace_mu_;
+  bool enabled_;
+  uint64_t start_ns_;
+  uint64_t end_ns_;
+  std::vector<KernelRecord> kernel_records_;
+  std::unordered_map<uint64_t, std::string> correlations_;
+  CUpti_SubscriberHandle subscriber_;
+};
+
+#endif  // PADDLE_WITH_CUPTI
+
+class DeviceTracerDummy : public DeviceTracer {
+ public:
+  DeviceTracerDummy() {}
+
+  void AddAnnotation(uint64_t id, const std::string &anno) {}
+
+  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
+                        uint32_t stream_id, uint32_t correlation_id) {}
+
+  bool IsEnabled() { return false; }
+
+  void Enable() {}
+
+  proto::Profile GenProfile() { return proto::Profile(); }
+
+  void Disable() {}
+};
+
+void CreateTracer(DeviceTracer **t) {
+#ifdef PADDLE_WITH_CUPTI
+  *t = new DeviceTracerImpl();
+#else
+  *t = new DeviceTracerDummy();
+#endif  // PADDLE_WITH_CUPTI
+}
+
+DeviceTracer *GetDeviceTracer() {
+  std::call_once(tracer_once_flag, CreateTracer, &tracer);
+  return tracer;
+}
+
+void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
+
+void ClearCurAnnotation() { cur_annotation = nullptr; }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..06cea84cc80ebefe9f5c396673cc9a35673f718f
--- /dev/null
+++ b/paddle/fluid/platform/device_tracer.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/profiler.pb.h"
+
+namespace paddle {
+namespace platform {
+
+///////////////////////
+// WARN: Under Development. Don't depend on it yet.
+//////////////////////
+
+// DeviceTracer performs the following tasks:
+// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
+// 2. Collect cuda statistics: start/end ts, memory, etc.
+// 3. Generate a protobuf for further analysis.
+class DeviceTracer {
+ public:
+  struct KernelRecord {
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint32_t device_id;
+    uint32_t stream_id;
+    uint32_t correlation_id;
+  };
+
+  virtual ~DeviceTracer() {}
+  // Needs to be called once before use.
+  virtual void Enable() = 0;
+  // Needs to be called once after use.
+  virtual void Disable() = 0;
+
+  // Add a pair to correlate an internal cuda id with a high-level
+  // annotation (string), so that cuda statistics can be presented with
+  // human-readable annotations.
+  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
+
+  // Add cuda kernel stats. `correlation_id` will be mapped to the
+  // annotation added before, for human readability.
+  virtual void AddKernelRecords(uint64_t start, uint64_t end,
+                                uint32_t device_id, uint32_t stream_id,
+                                uint32_t correlation_id) = 0;
+
+  // Generate a proto after profiling is done (i.e. Disable() was called).
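The `GenProfile` declaration that follows is easiest to read as a join between two tables keyed by correlation id: CUPTI reports timed kernel records, and the tracer separately stores a human-readable annotation per id. The sketch below reproduces that join with simplified stand-in types; it is an illustration, not the tracer's real implementation.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct KernelRecord {
  uint64_t start_ns, end_ns;
  uint32_t correlation_id;
};

int main() {
  // Annotations registered at kernel-launch time, keyed by correlation id.
  std::map<uint32_t, std::string> correlations = {{1, "conv2d"}, {2, "relu"}};
  // Timed kernel activities reported later by the profiling backend.
  std::vector<KernelRecord> records = {{100, 400, 1}, {450, 500, 2}};

  // Join the two tables and aggregate per-annotation durations.
  std::map<std::string, uint64_t> total_ns;
  for (const auto& r : records) {
    auto it = correlations.find(r.correlation_id);
    if (it == correlations.end()) continue;  // cannot relate this activity
    total_ns[it->second] += r.end_ns - r.start_ns;
  }
  for (const auto& kv : total_ns) {
    std::printf("%s: %.6f ms\n", kv.first.c_str(), kv.second / 1e6);
  }
  return 0;
}
```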
+ virtual proto::Profile GenProfile() = 0; + + virtual bool IsEnabled() = 0; +}; + +// Get a DeviceTracer. +DeviceTracer* GetDeviceTracer(); + +// Set a name for the cuda kernel operation being launched by the thread. +void SetCurAnnotation(const char* anno); +// Clear the name after the operation is done. +void ClearCurAnnotation(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 264b4ebf2c06d9e688a32a223dff3ec079333fd9..567c137a55e4e0cb0b5080893be305e847bb61e1 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,8 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc - DEPS dynamic_loader) + +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +if (CUPTI_FOUND) + list(APPEND CUDA_SRCS cupti.cc) +endif(CUPTI_FOUND) +nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25660c6ed411bbe444ac8aa10a324cbed9c9d4f --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUPTI + +#include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag cupti_dso_flag; +void *cupti_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUPTI_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h new file mode 100644 index 0000000000000000000000000000000000000000..a79868c18c14b6bcdf85d60e766c7ec8be993f28 --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUPTI
+#include <cupti.h>
+#include <dlfcn.h>
+#include <mutex>
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag cupti_dso_flag;
+extern void *cupti_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cupti routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#ifdef PADDLE_USE_DSO
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                    \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {         \
+      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);          \
+      std::call_once(cupti_dso_flag,                               \
+                     paddle::platform::dynload::GetCUPTIDsoHandle, \
+                     &cupti_dso_handle);                           \
+      void *p_##__name = dlsym(cupti_dso_handle, #__name);         \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);     \
+    }                                                              \
+  };                                                               \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
+  struct DynLoad__##__name {                               \
+    template <typename... Args>                            \
+    inline CUptiResult CUPTIAPI operator()(Args... args) { \
+      return __name(args...);                              \
+    }                                                      \
+  };                                                       \
+  extern DynLoad__##__name __name
+#endif
+
+#define CUPTI_ROUTINE_EACH(__macro)           \
+  __macro(cuptiActivityEnable);               \
+  __macro(cuptiActivityDisable);              \
+  __macro(cuptiActivityRegisterCallbacks);    \
+  __macro(cuptiActivityGetAttribute);         \
+  __macro(cuptiActivitySetAttribute);         \
+  __macro(cuptiGetTimestamp);                 \
+  __macro(cuptiActivityGetNextRecord);        \
+  __macro(cuptiGetResultString);              \
+  __macro(cuptiActivityGetNumDroppedRecords); \
+  __macro(cuptiActivityFlushAll);             \
+  __macro(cuptiFinalize);                     \
+  __macro(cuptiSubscribe);                    \
+  __macro(cuptiUnsubscribe);                  \
+  __macro(cuptiEnableCallback);
+
+CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index db1eb41f28e67ee4ed6b276714db989bd25ece2e..8eb5966e5776004a03fee17b74ae72614331a694 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
               "libcurand. For instance, /usr/local/cuda/lib64.
If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); + namespace paddle { namespace platform { namespace dynload { +static const char* cupti_lib_path = CUPTI_LIB_PATH; + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) { #endif } +void GetCUPTIDsoHandle(void** dso_handle) { + std::string cupti_path = cupti_lib_path; + if (!FLAGS_cupti_dir.empty()) { + cupti_path = FLAGS_cupti_dir; + } +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false); +#else + GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false); +#endif +} + void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 4ffc335332698d1aba262edf2800965e72de77cb..b5b9c4af916241c1c7361b506f74563ebcf69b9a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle); */ void GetCUDNNDsoHandle(void** dso_handle); +void GetCUPTIDsoHandle(void** dso_handle); + /** * @brief load the DSO of CURAND * diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu index 212ea8517e897f86a3c19bb5d996c567854811a6..32a293796c09e5254c5eb48d11fa74617b3465ac 100644 --- a/paddle/fluid/platform/nccl_test.cu +++ b/paddle/fluid/platform/nccl_test.cu @@ -129,9 +129,6 @@ TEST(NCCL, all_reduce) { } // namespace paddle int main(int argc, char** argv) { - // FIXME(tonyyang-svail): - // Due to the driver issue on our CI, disable for now - return 0; dev_count = paddle::platform::GetCUDADeviceCount(); if (dev_count <= 1) { LOG(WARNING) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 4804df7966dfedf7264eebaad3a42ed92739b096..0076762d2f8c3840497ad354234680c4e41607cb 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,13 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA #include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace platform { @@ -132,10 +138,13 @@ RecordEvent::RecordEvent(const std::string& name, dev_ctx_ = dev_ctx; name_ = name; PushEvent(name_, dev_ctx_); + // Maybe need the same push/pop behavior. + SetCurAnnotation(name_.c_str()); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; + ClearCurAnnotation(); PopEvent(name_, dev_ctx_); } @@ -147,7 +156,14 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; - g_profiler_place = (g_state == ProfilerState::kCUDA) ? 
"CUDA" : "CPU"; + if (g_state == ProfilerState::kCUDA) { + g_profiler_place = "CUDA"; + } else if (g_state == ProfilerState::kCPU) { + g_profiler_place = "CPU"; + } else { + g_profiler_place = "All"; + GetDeviceTracer()->Enable(); + } #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. @@ -190,6 +206,12 @@ void DisableProfiler(EventSortingKey sorted_key) { Mark("_stop_profiler_", nullptr); g_state = ProfilerState::kDisabled; + DeviceTracer* tracer = GetDeviceTracer(); + if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { + tracer->Disable(); + tracer->GenProfile(); + } + std::vector> all_events = GetAllEvents(); ParseEvents(all_events, sorted_key); ResetProfiler(); @@ -254,9 +276,11 @@ void ParseEvents(std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_profiler_place == "CUDA") - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + double event_time = + (g_profiler_place == "CUDA" || g_profiler_place == "All") + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index a3d22df70057e7967d9fc349ea0cbd73ceb8e0e9..775edb85c0c3b4b6d2cf2e86c527af0722c72fea 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.pb.h" namespace paddle { namespace platform { @@ -93,6 +94,7 @@ enum ProfilerState { kDisabled, // disabled state kCPU, // CPU profiling state kCUDA, // GPU profiling state + kAll, // Profile both CPU and GPU. (Currently experimental). }; void Mark(const std::string& name, const DeviceContext* dev_ctx); @@ -102,7 +104,7 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + RecordEvent(const std::string& name, const DeviceContext* dev_ctx); ~RecordEvent(); @@ -110,9 +112,12 @@ struct RecordEvent { const DeviceContext* dev_ctx_; // Event name std::string name_; + // Need to distinguish name by op type, block_id, program_id and perhaps + // different kernel invocations within an op. + std::string full_name_; }; -// Return the event list of all threads. Asummed the returned value calls +// Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto new file mode 100644 index 0000000000000000000000000000000000000000..bdd86a0440d2b00eaee14195030456d0ad217f9a --- /dev/null +++ b/paddle/fluid/platform/profiler.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.platform.proto; + +message Event { + optional string name = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; + optional uint32 device_id = 5; + optional uint32 stream_id = 6; +} + +message Profile { + repeated Event events = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; +} \ No newline at end of file diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index b725be79529c5ccdde12446b5b5c09eaf47550e6..b0a2497d919b65afbe5eeaf4fe47c19baa1aba1c 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -252,7 +252,7 @@ void BindVarDsec(py::module &m) { .value("CHANNEL", proto::VarType::CHANNEL) .value("PLACE_LIST", proto::VarType::PLACE_LIST) .value("READER", proto::VarType::READER) - .value("NCCL_COM", proto::VarType::NCCL_COM); + .value("RAW", proto::VarType::RAW); } void BindOpDesc(py::module &m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index abe2b114492007ec19f2fcdb09aa173c88badbf5..ac7d1efb577505b70e10a70cdcfd3ed9c5fe1f5c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -49,11 +49,6 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); namespace paddle { namespace pybind { -static size_t UniqueIntegerGenerator(const std::string &prefix) { - static std::unordered_map> generators; - return generators[prefix].fetch_add(1); -} - bool IsCompiledWithCUDA() { #ifndef PADDLE_WITH_CUDA return false; @@ -410,7 +405,6 @@ All parameter, weight, gradient are variables in Paddle. (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & Executor::Run); - m.def("unique_integer", UniqueIntegerGenerator); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); @@ -465,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle. .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) .value("kCUDA", platform::ProfilerState::kCUDA) + .value("kAll", platform::ProfilerState::kAll) .export_values(); py::enum_(m, "EventSortingKey", py::arithmetic()) diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index 65c46745556bc5ea91fdd4e33060f2535422e8e8..78c0cc378231f763597556cc5450f6f03ab2b291 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -58,7 +58,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | -| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | +| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_PYTHON` | ON | Build with python support. 
Turn this off if build is only for capi. | diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 56fa138786104df3b67cd5248d1625509cc913d1..06319fc638984f8f8ed897c362f516e1534bc8db 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -40,7 +40,7 @@ function cmake_gen() { -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} - -DWITH_GOLANG=${WITH_GOLANG:-ON} + -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} -DWITH_SWIG_PY=ON -DWITH_C_API=${WITH_C_API:-OFF} @@ -49,6 +49,7 @@ function cmake_gen() { -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} + -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -64,7 +65,7 @@ EOF -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DWITH_GOLANG=${WITH_GOLANG:-ON} \ + -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ -DWITH_C_API=${WITH_C_API:-OFF} \ @@ -72,6 +73,7 @@ EOF -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ + -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } @@ -171,7 +173,7 @@ EOF EOF if [[ ${WITH_GPU} == "ON" ]]; then - NCCL_DEPS="apt-get install -y libnccl-dev &&" + NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&" else NCCL_DEPS="" fi diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 270f2f4c181df847348be12a199534d47b3577f5..0fea6a80794a64abc2dbf1428d534840febcd450 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,10 +28,9 @@ int main(int argc, char** argv) { } #ifdef PADDLE_WITH_CUDA new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory," - "warpctc_dir")); + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 36919ab00bf6fc1f9aee350074e5532bfdc7d45e..0d497dcfce909d969081ac49dd760e70f025f73d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,12 +3,14 @@ file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) file(GLOB UTILS_PY_FILES . 
./paddle/utils/*.py) file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) +file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py) set(PY_FILES paddle/__init__.py ${TRAINER_PY_FILES} ${HELPERS_PY_FILES} ${UTILS_PY_FILES} - ${V2_PY_FILES}) + ${V2_PY_FILES} + ${FLUID_PY_FILES}) add_custom_target(copy_paddle_master) @@ -43,10 +45,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) -add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so - COMMAND cmake -E copy $ ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so +add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so + COMMAND cmake -E copy $ ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so DEPENDS paddle_pybind) -add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp @@ -72,7 +74,7 @@ if (WITH_TESTING) add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/plot/tests) - add_subdirectory(paddle/v2/fluid/tests) + add_subdirectory(paddle/fluid/tests) endif() endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} diff --git a/python/paddle/fluid/.gitignore b/python/paddle/fluid/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..80c1cf3fcb86aa50600e02e4765640a91560916e --- /dev/null +++ b/python/paddle/fluid/.gitignore @@ -0,0 +1,2 @@ +proto +core.so diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/fluid/__init__.py similarity index 98% rename from python/paddle/v2/fluid/__init__.py rename to python/paddle/fluid/__init__.py index 361fb3f5ad9394a5cc1a9927005e7276ee056e90..39d13d3ab5fb8340509e01b0bd1de6f66ce99c21 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -39,6 +39,7 @@ from concurrency import (Go, make_channel, channel_send, channel_recv, import clip from memory_optimization_transpiler import memory_optimize import profiler +import unique_name Tensor = LoDTensor @@ -63,6 +64,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ 'DistributeTranspiler', 'memory_optimize', 'profiler', + 'unique_name', ] diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/fluid/backward.py similarity index 99% rename from python/paddle/v2/fluid/backward.py rename to python/paddle/fluid/backward.py index ba27aaa24601bd72bcdbd064242ea2b1c345340c..58fa7f1bebc082df11e07f5f97927b417af3e4e8 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.v2.fluid import framework as framework +from paddle.fluid import framework as framework from . 
import core import collections import copy +import unique_name __all__ = [ 'append_backward', @@ -391,7 +392,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): for name in op_desc.output_arg_names(): if block.desc.find_var(name.encode("ascii")): - new_name = "%s_%s" % (name, core.unique_integer(name)) + new_name = unique_name.generate(name) op_desc.rename_output(name, new_name) var_map[name] = new_name diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/fluid/clip.py similarity index 100% rename from python/paddle/v2/fluid/clip.py rename to python/paddle/fluid/clip.py diff --git a/python/paddle/v2/fluid/concurrency.py b/python/paddle/fluid/concurrency.py similarity index 100% rename from python/paddle/v2/fluid/concurrency.py rename to python/paddle/fluid/concurrency.py diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py similarity index 100% rename from python/paddle/v2/fluid/data_feeder.py rename to python/paddle/fluid/data_feeder.py diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/fluid/debuger.py similarity index 100% rename from python/paddle/v2/fluid/debuger.py rename to python/paddle/fluid/debuger.py diff --git a/python/paddle/v2/fluid/default_scope_funcs.py b/python/paddle/fluid/default_scope_funcs.py similarity index 94% rename from python/paddle/v2/fluid/default_scope_funcs.py rename to python/paddle/fluid/default_scope_funcs.py index eeb9fb204337b775e95a8e1a5628c86275a6e5c9..f8faf6942524612ccc63713240bb289eeeaf75eb 100644 --- a/python/paddle/v2/fluid/default_scope_funcs.py +++ b/python/paddle/fluid/default_scope_funcs.py @@ -26,7 +26,7 @@ A `scoped_function` will take a `function` as input. That function will be invoked in a new local scope. """ -import paddle.v2.fluid.core +import paddle.fluid.core import threading __tl_scope__ = threading.local() @@ -44,13 +44,13 @@ __all__ = [ def get_cur_scope(): """ Get current scope. 
- :rtype: paddle.v2.fluid.core.Scope + :rtype: paddle.fluid.core.Scope """ cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) if cur_scope_stack is None: __tl_scope__.cur_scope = list() if len(__tl_scope__.cur_scope) == 0: - __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope()) + __tl_scope__.cur_scope.append(paddle.fluid.core.Scope()) return __tl_scope__.cur_scope[-1] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py similarity index 99% rename from python/paddle/v2/fluid/distribute_transpiler.py rename to python/paddle/fluid/distribute_transpiler.py index 2fcf3753c5f1211d3b27f38fbdc8d097c437c79a..8da9ca290b22ae69b1fd195d8614c31dc4e13e00 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -226,8 +226,7 @@ class DistributeTranspiler: rpc_client_var = program.global_block().create_var( name="RPC_CLIENT_VAR", persistable=True, - dtype='float32', # dtype and shape is not used in fact - shape=[0]) + type=core.VarDesc.VarType.RAW) # create send_op program.global_block().append_op( diff --git a/python/paddle/v2/fluid/distribute_transpiler_simple.py b/python/paddle/fluid/distribute_transpiler_simple.py similarity index 100% rename from python/paddle/v2/fluid/distribute_transpiler_simple.py rename to python/paddle/fluid/distribute_transpiler_simple.py diff --git a/python/paddle/v2/fluid/distributed_spliter.py b/python/paddle/fluid/distributed_spliter.py similarity index 100% rename from python/paddle/v2/fluid/distributed_spliter.py rename to python/paddle/fluid/distributed_spliter.py diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/fluid/evaluator.py similarity index 98% rename from python/paddle/v2/fluid/evaluator.py rename to python/paddle/fluid/evaluator.py index 1f4618310cbba8ef3fdce6a3beb01876c5074e32..8cc49053337a25d917b85a69a453cf29b1597548 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -15,7 +15,8 @@ import numpy as np import layers -from framework import Program, unique_name, Variable, program_guard +from framework import Program, Variable, program_guard +import unique_name from layer_helper import LayerHelper __all__ = [ @@ -96,7 +97,7 @@ class Evaluator(object): """ state = self.helper.create_variable( - name="_".join([unique_name(self.helper.name), suffix]), + name="_".join([unique_name.generate(self.helper.name), suffix]), persistable=True, dtype=dtype, shape=shape) diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/fluid/executor.py similarity index 100% rename from python/paddle/v2/fluid/executor.py rename to python/paddle/fluid/executor.py diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/fluid/framework.py similarity index 98% rename from python/paddle/v2/fluid/framework.py rename to python/paddle/fluid/framework.py index 0f6cb90e27c714d02d00402aeee4e5d718f77502..2ca8c320842a1d8dce04d190bb200f3d49c78154 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,6 +20,7 @@ import numpy as np import proto.framework_pb2 as framework_pb2 from . import core +import unique_name __all__ = [ 'Block', @@ -47,20 +48,6 @@ def grad_var_name(var_name): return var_name + GRAD_VAR_SUFFIX -def unique_name(prefix): - """ - Generate unique names with prefix - - Args: - prefix(str): The prefix of return string - - Returns(str): A unique string with the prefix - - """ - uid = core.unique_integer(prefix) # unique during whole process. 
- return "_".join([prefix, str(uid)]) - - def convert_np_dtype_to_dtype_(np_dtype): """ Convert the data type in numpy to the data type in Paddle @@ -175,7 +162,7 @@ class Variable(object): self.error_clip = error_clip if name is None: - name = Variable._unique_var_name_() + name = unique_name.generate('_generated_var') is_new_var = False self.desc = self.block.desc.find_var(name) @@ -307,12 +294,6 @@ class Variable(object): def type(self): return self.desc.type() - @staticmethod - def _unique_var_name_(): - prefix = "_generated_var" - uid = core.unique_integer(prefix) # unique during whole process. - return "_".join([prefix, str(uid)]) - def set_error_clip(self, error_clip): self.error_clip = error_clip @@ -766,13 +747,8 @@ class Block(object): if not self.has_var(name): raise ValueError("var %s is not in current" % name) v = self.var(name) - stop_gradient = None - trainable = None - optimize_attr = None - regularizer = None - gradient_clip_attr = None - error_clip = None if type(v) == Parameter: + var_type = "Parameter" stop_gradient = v.stop_gradient trainable = v.trainable optimize_attr = v.optimize_attr @@ -780,15 +756,16 @@ class Block(object): gradient_clip_attr = v.gradient_clip_attr error_clip = v.error_clip elif type(v) == Variable: + var_type = "Variable" error_clip = v.error_clip stop_gradient = v.stop_gradient else: raise ValueError("unsupported var type: %s", type(v)) self.desc.rename_var(name, new_name) + # NOTE: v is destroyed by C++ after calling rename_var. d = self.desc.find_var(new_name) - var = None - if type(v) == Parameter: + if var_type == "Parameter": var = Parameter( self, d.shape(), @@ -800,9 +777,10 @@ class Block(object): regularizer=regularizer, gradient_clip_attr=gradient_clip_attr, error_clip=error_clip) - elif type(v) == Variable: + elif var_type == "Variable": var = Variable( self, + type=v.type, name=new_name, error_clip=error_clip, stop_gradient=stop_gradient) diff --git a/python/paddle/v2/fluid/graphviz.py b/python/paddle/fluid/graphviz.py similarity index 100% rename from python/paddle/v2/fluid/graphviz.py rename to python/paddle/fluid/graphviz.py diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/fluid/initializer.py similarity index 100% rename from python/paddle/v2/fluid/initializer.py rename to python/paddle/fluid/initializer.py diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/fluid/io.py similarity index 80% rename from python/paddle/v2/fluid/io.py rename to python/paddle/fluid/io.py index 8a8bd089b5890f99c5f8f8961a0c2974208ca3e0..1817caa94275e4efa47ec1a5a0aa861255c75561 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -14,8 +14,8 @@ import os -from paddle.v2.fluid.evaluator import Evaluator -from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable +from paddle.fluid.evaluator import Evaluator +from paddle.fluid.framework import Program, Parameter, default_main_program, Variable from . import core __all__ = [ @@ -68,7 +68,7 @@ def save_vars(executor, main_program=None, vars=None, predicate=None, - save_file_name=None): + filename=None): """ Save variables to directory by executor. @@ -80,8 +80,8 @@ def save_vars(executor, as a bool. If it returns true, the corresponding input variable will be saved. :param vars: variables need to be saved. If vars is specified, program & predicate will be ignored - :param save_file_name: The name of a single file that all vars are saved to. - If it is None, save variables to separate files. 
+ :param filename: The name of a single file that all vars are saved to. + If it is None, save variables to separate files. :return: None """ @@ -95,7 +95,7 @@ def save_vars(executor, executor, dirname=dirname, vars=filter(predicate, main_program.list_vars()), - save_file_name=save_file_name) + filename=filename) else: save_program = Program() save_block = save_program.global_block() @@ -103,7 +103,7 @@ def save_vars(executor, save_var_map = {} for each_var in vars: new_var = _clone_var_in_block_(save_block, each_var) - if save_file_name is None: + if filename is None: save_block.append_op( type='save', inputs={'X': [new_var]}, @@ -112,7 +112,7 @@ def save_vars(executor, else: save_var_map[new_var.name] = new_var - if save_file_name is not None: + if filename is not None: save_var_list = [] for name in sorted(save_var_map.keys()): save_var_list.append(save_var_map[name]) @@ -121,12 +121,12 @@ def save_vars(executor, type='save_combine', inputs={'X': save_var_list}, outputs={}, - attrs={'file_path': os.path.join(dirname, save_file_name)}) + attrs={'file_path': os.path.join(dirname, filename)}) executor.run(save_program) -def save_params(executor, dirname, main_program=None, save_file_name=None): +def save_params(executor, dirname, main_program=None, filename=None): """ Save all parameters to directory with executor. """ @@ -136,11 +136,10 @@ def save_params(executor, dirname, main_program=None, save_file_name=None): main_program=main_program, vars=None, predicate=is_parameter, - save_file_name=save_file_name) + filename=filename) -def save_persistables(executor, dirname, main_program=None, - save_file_name=None): +def save_persistables(executor, dirname, main_program=None, filename=None): """ Save all persistables to directory with executor. """ @@ -150,7 +149,7 @@ def save_persistables(executor, dirname, main_program=None, main_program=main_program, vars=None, predicate=is_persistable, - save_file_name=save_file_name) + filename=filename) def load_vars(executor, @@ -158,7 +157,7 @@ def load_vars(executor, main_program=None, vars=None, predicate=None, - load_file_name=None): + filename=None): """ Load variables from directory by executor. @@ -170,8 +169,8 @@ def load_vars(executor, as a bool. If it returns true, the corresponding input variable will be loaded. :param vars: variables need to be loaded. If vars is specified, program & predicate will be ignored - :param load_file_name: The name of the single file that all vars are loaded from. - If it is None, load variables from separate files. + :param filename: The name of the single file that all vars are loaded from. + If it is None, load variables from separate files. 
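# (Usage sketch with placeholder paths; assumes a default main program with
# persistable variables has already been built and initialized.)
# The rename from save_file_name/load_file_name to `filename` reads the same
# on both sides of a checkpoint round trip:
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())

# One file per persistable variable under ./ckpt:
fluid.io.save_persistables(exe, dirname="ckpt")
# Or everything combined into the single file ./ckpt/params.bin:
fluid.io.save_persistables(exe, dirname="ckpt", filename="params.bin")
# Load with the same filename choice (or None) that was used when saving:
fluid.io.load_persistables(exe, dirname="ckpt", filename="params.bin")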
:return: None """ @@ -185,7 +184,7 @@ def load_vars, executor, dirname=dirname, vars=filter(predicate, main_program.list_vars()), - load_file_name=load_file_name) + filename=filename) else: load_prog = Program() load_block = load_prog.global_block() @@ -194,7 +193,7 @@ def load_vars, for each_var in vars: assert isinstance(each_var, Variable) new_var = _clone_var_in_block_(load_block, each_var) - if load_file_name is None: + if filename is None: load_block.append_op( type='load', inputs={}, @@ -203,7 +202,7 @@ def load_vars, else: load_var_map[new_var.name] = new_var - if load_file_name is not None: + if filename is not None: load_var_list = [] for name in sorted(load_var_map.keys()): load_var_list.append(load_var_map[name]) @@ -212,12 +211,12 @@ def load_vars, type='load_combine', inputs={}, outputs={"Out": load_var_list}, - attrs={'file_path': os.path.join(dirname, load_file_name)}) + attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) -def load_params(executor, dirname, main_program=None, load_file_name=None): +def load_params(executor, dirname, main_program=None, filename=None): """ load all parameters from directory by executor. """ @@ -226,11 +225,10 @@ def load_params(executor, dirname, main_program=None, load_file_name=None): dirname=dirname, main_program=main_program, predicate=is_parameter, - load_file_name=load_file_name) + filename=filename) -def load_persistables(executor, dirname, main_program=None, - load_file_name=None): +def load_persistables(executor, dirname, main_program=None, filename=None): """ load all persistables from directory by executor. """ @@ -239,7 +237,7 @@ def load_persistables(executor, dirname, main_program=None, dirname=dirname, main_program=main_program, predicate=is_persistable, - load_file_name=load_file_name) + filename=filename) def get_inference_program(target_vars, main_program=None): @@ -299,7 +297,8 @@ def save_inference_model(dirname, target_vars, executor, main_program=None, - save_file_name=None): + model_filename=None, + params_filename=None): """ Build a model especially for inference, and save it to directory by the executor. @@ -310,8 +309,11 @@ def save_inference_model(dirname, :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model. Default default_main_program(). - :param save_file_name: The name of a single file that all parameters are saved to. - If it is None, save parameters to separate files. + :param model_filename: The name of the file to save the inference program to. + If not specified, the default filename `__model__` will be used. + :param params_filename: The name of the file to save parameters to. + It is used for the case that all parameters are saved in a single binary file. + If not specified, parameters are considered saved in separate files. :return: None """ @@ -342,15 +344,19 @@ def save_inference_model(dirname, prepend_feed_ops(inference_program, feeded_var_names) append_fetch_ops(inference_program, fetch_var_names) - if save_file_name == None: - model_file_name = dirname + "/__model__" + if model_filename is not None: + model_filename = os.path.basename(model_filename) else: - model_file_name = dirname + "/__model_combined__" + model_filename = "__model__" + model_filename = os.path.join(dirname, model_filename) - with open(model_file_name, "wb") as f: + if params_filename is not None: + params_filename = os.path.basename(params_filename) + + with open(model_filename, "wb") as f: f.write(inference_program.desc.serialize_to_string()) - save_persistables(executor, dirname, inference_program, save_file_name) + save_persistables(executor, dirname, inference_program, params_filename) def get_feed_targets_names(program): @@ -371,15 +377,21 @@ def get_fetch_targets_names(program): return fetch_targets_names -def load_inference_model(dirname, executor, load_file_name=None): +def load_inference_model(dirname, + executor, + model_filename=None, + params_filename=None): """ Load inference model from a directory :param dirname: directory path :param executor: executor that load inference model - :param load_file_name: The name of the single file that all parameters are loaded from. - If it is None, load parameters from separate files. - + :param model_filename: The name of the file to load the inference program from. + If not specified, the default filename `__model__` will be used. + :param params_filename: The name of the file to load parameters from. + It is used for the case that all parameters are saved in a single binary file. + If not specified, parameters are considered saved in separate files. + :return: [program, feed_target_names, fetch_targets] program: program especially for inference.
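# (Usage sketch; the tiny network and all file names are placeholders, not
# part of this patch.) model_filename and params_filename can now be chosen
# independently; leaving both as None keeps a ./model_dir/__model__ file
# plus one file per parameter:
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[13], dtype='float32')
predict = fluid.layers.fc(input=image, size=1, act=None)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

fluid.io.save_inference_model(
    "model_dir", ['image'], [predict], exe,
    model_filename="model.bin", params_filename="params.bin")

[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "model_dir", exe,
    model_filename="model.bin", params_filename="params.bin")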
feed_target_names: Names of variables that need to feed data @@ -388,16 +400,20 @@ def load_inference_model(dirname, executor, load_file_name=None): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) - if load_file_name == None: - model_file_name = dirname + "/__model__" + if model_filename is not None: + model_filename = os.path.basename(model_filename) else: - model_file_name = dirname + "/__model_combined__" + model_filename = "__model__" + model_filename = os.path.join(dirname, model_filename) + + if params_filename is not None: + params_filename = os.path.basename(params_filename) - with open(model_file_name, "rb") as f: + with open(model_filename, "rb") as f: program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) - load_persistables(executor, dirname, program, load_file_name) + load_persistables(executor, dirname, program, params_filename) feed_target_names = get_feed_targets_names(program) fetch_target_names = get_fetch_targets_names(program) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py similarity index 93% rename from python/paddle/v2/fluid/layer_helper.py rename to python/paddle/fluid/layer_helper.py index e7abc23f2f1da967ed6fff2e758f3dc6f80d60a8..6437dbb446e13d9ba24098dd936bfba395c6a566 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -15,9 +15,9 @@ import copy import itertools -from framework import Variable, Parameter, default_main_program, default_startup_program, \ - unique_name, dtype_is_floating -from paddle.v2.fluid.initializer import Constant, Xavier +from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +import unique_name +from paddle.fluid.initializer import Constant, Xavier from param_attr import ParamAttr, WeightNormParamAttr @@ -27,7 +27,7 @@ class LayerHelper(object): self.layer_type = layer_type name = self.kwargs.get('name', None) if name is None: - self.kwargs['name'] = unique_name(self.layer_type) + self.kwargs['name'] = unique_name.generate(self.layer_type) @property def name(self): @@ -117,17 +117,20 @@ class LayerHelper(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name(".".join([self.name, 'weight_norm_norm'])), + name=unique_name.generate(".".join( + [self.name, 'weight_norm_norm'])), dtype=dtype, persistable=False) abs_out = block.create_var( - name=unique_name(".".join([self.name, 'weight_norm_abs'])), + name=unique_name.generate(".".join( + [self.name, 'weight_norm_abs'])), dtype=dtype, persistable=False) block.append_op( type='abs', inputs={'X': x}, outputs={'Out': abs_out}) pow_out = block.create_var( - name=unique_name(".".join([self.name, 'weight_norm_pow'])), + name=unique_name.generate(".".join( + [self.name, 'weight_norm_pow'])), dtype=dtype, persistable=False) block.append_op( @@ -136,7 +139,8 @@ class LayerHelper(object): outputs={'Out': pow_out}, attrs={'factor': float(p)}) sum_out = block.create_var( - name=unique_name(".".join([self.name, 'weight_norm_sum'])), + name=unique_name.generate(".".join( + [self.name, 'weight_norm_sum'])), dtype=dtype, persistable=False) block.append_op( @@ -161,7 +165,7 @@ class LayerHelper(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_reshape'])), dtype=dtype, persistable=False) @@ -178,7 +182,7 @@ class 
LayerHelper(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_transpose'])), dtype=dtype, persistable=False) @@ -196,7 +200,8 @@ class LayerHelper(object): """Computes the norm over all dimensions except dim""" if out is None: out = block.create_var( - name=unique_name(".".join([self.name, 'weight_norm_norm'])), + name=unique_name.generate(".".join( + [self.name, 'weight_norm_norm'])), dtype=dtype, persistable=False) if dim is None: @@ -286,7 +291,7 @@ class LayerHelper(object): assert isinstance(attr, ParamAttr) suffix = 'b' if is_bias else 'w' if attr.name is None: - attr.name = unique_name(".".join([self.name, suffix])) + attr.name = unique_name.generate(".".join([self.name, suffix])) if default_initializer is None and attr.initializer is None: if is_bias: @@ -316,7 +321,7 @@ class LayerHelper(object): def create_tmp_variable(self, dtype, stop_gradient=False): return self.main_program.current_block().create_var( - name=unique_name(".".join([self.name, 'tmp'])), + name=unique_name.generate(".".join([self.name, 'tmp'])), dtype=dtype, persistable=False, stop_gradient=stop_gradient) diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py similarity index 100% rename from python/paddle/v2/fluid/layers/__init__.py rename to python/paddle/fluid/layers/__init__.py diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py similarity index 97% rename from python/paddle/v2/fluid/layers/control_flow.py rename to python/paddle/fluid/layers/control_flow.py index b9ab28a86a226c3027b2a449fd645d500d39f14b..1bb1aa30ee1019c6f80eb64b6dc20459e7a3073b 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -428,7 +428,8 @@ class StaticRNN(object): raise ValueError( "if init is None, memory at least need shape and batch_ref") parent_block = self.parent_block() - var_name = unique_name("@".join([self.helper.name, "memory_boot"])) + var_name = unique_name.generate("@".join( + [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( name=var_name, shape=shape, @@ -450,7 +451,7 @@ class StaticRNN(object): return self.memory(init=boot_var) else: pre_mem = self.helper.create_variable( - name=unique_name("@".join([self.helper.name, "mem"])), + name=unique_name.generate("@".join([self.helper.name, "mem"])), dtype=init.dtype, shape=init.shape) self.memories[pre_mem.name] = StaticRNNMemoryLink( @@ -652,7 +653,8 @@ class While(object): parent_block.append_op( type='while', inputs={ - 'X': [parent_block.var(x_name) for x_name in x_name_list], + 'X': + [parent_block.var_recursive(x_name) for x_name in x_name_list], 'Condition': [self.cond_var] }, outputs={'Out': out_vars, @@ -709,7 +711,7 @@ def lod_rank_table(x, level=0): helper = LayerHelper("lod_rank_table", **locals()) table = helper.create_variable( type=core.VarDesc.VarType.LOD_RANK_TABLE, - name=unique_name("lod_rank_table")) + name=unique_name.generate("lod_rank_table")) helper.append_op( type='lod_rank_table', inputs={'X': x}, @@ -807,7 +809,7 @@ def lod_tensor_to_array(x, table): """ helper = LayerHelper("lod_tensor_to_array", **locals()) array = helper.create_variable( - name=unique_name("lod_tensor_to_array"), + name=unique_name.generate("lod_tensor_to_array"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=x.dtype) helper.append_op( @@ -1264,11 +1266,11 @@ class 
IfElse(object): if id(x) not in self.input_table: parent_block = self.parent_block() out_true = parent_block.create_var( - name=unique_name('ifelse_input' + self.helper.name), + name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) out_false = parent_block.create_var( - name=unique_name('ifelse_input' + self.helper.name), + name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) parent_block.append_op( type='split_lod_tensor', @@ -1310,7 +1312,8 @@ class IfElse(object): raise TypeError("Each output should be a variable") # create outside tensor outside_out = parent_block.create_var( - name=unique_name("_".join([self.helper.name, 'output'])), + name=unique_name.generate("_".join( + [self.helper.name, 'output'])), dtype=each_out.dtype) out_table.append(outside_out) @@ -1373,7 +1376,7 @@ class DynamicRNN(object): parent_block = self._parent_block_() if self.lod_rank_table is None: self.lod_rank_table = parent_block.create_var( - name=unique_name('lod_rank_table'), + name=unique_name.generate('lod_rank_table'), type=core.VarDesc.VarType.LOD_RANK_TABLE) self.lod_rank_table.stop_gradient = True parent_block.append_op( @@ -1381,7 +1384,8 @@ class DynamicRNN(object): inputs={"X": x}, outputs={"Out": self.lod_rank_table}) self.max_seq_len = parent_block.create_var( - name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64') + name=unique_name.generate('dynamic_rnn_max_seq_len'), + dtype='int64') self.max_seq_len.stop_gradient = False parent_block.append_op( type='max_sequence_len', @@ -1395,7 +1399,7 @@ class DynamicRNN(object): outputs={'Out': self.cond}) input_array = parent_block.create_var( - name=unique_name('dynamic_rnn_input_array'), + name=unique_name.generate('dynamic_rnn_input_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=x.dtype) self.input_array.append((input_array, x.dtype)) @@ -1416,7 +1420,7 @@ class DynamicRNN(object): "static_input() must be called after step_input().") parent_block = self._parent_block_() x_reordered = parent_block.create_var( - name=unique_name("dynamic_rnn_static_input_reordered"), + name=unique_name.generate("dynamic_rnn_static_input_reordered"), type=core.VarDesc.VarType.LOD_TENSOR, dtype=x.dtype) parent_block.append_op( @@ -1478,7 +1482,7 @@ class DynamicRNN(object): 'invoked before ' 'memory(init=init, need_reordered=True, ...).') init_reordered = parent_block.create_var( - name=unique_name('dynamic_rnn_mem_init_reordered'), + name=unique_name.generate('dynamic_rnn_mem_init_reordered'), type=core.VarDesc.VarType.LOD_TENSOR, dtype=init.dtype) parent_block.append_op( @@ -1490,7 +1494,7 @@ class DynamicRNN(object): outputs={'Out': [init_reordered]}) init_tensor = init_reordered mem_array = parent_block.create_var( - name=unique_name('dynamic_rnn_mem_array'), + name=unique_name.generate('dynamic_rnn_mem_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=init.dtype) parent_block.append_op( @@ -1510,9 +1514,10 @@ class DynamicRNN(object): ) parent_block = self._parent_block_() init = parent_block.create_var( - name=unique_name('mem_init'), dtype=dtype) + name=unique_name.generate('mem_init'), dtype=dtype) arr, dtype = self.input_array[0] - in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype) + in0 = parent_block.create_var( + name=unique_name.generate('in0'), dtype=dtype) parent_block.append_op( type='read_from_array', inputs={'X': [arr], @@ -1551,7 +1556,7 @@ class DynamicRNN(object): parent_block = self._parent_block_() for each in outputs: outside_array = parent_block.create_var( 
- name=unique_name("_".join( + name=unique_name.generate("_".join( [self.helper.name, "output_array", each.name])), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=each.dtype) diff --git a/python/paddle/v2/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py similarity index 90% rename from python/paddle/v2/fluid/layers/detection.py rename to python/paddle/fluid/layers/detection.py index 5ae4da1ea31d036217c5595f8b30842403896a7c..f380f5c00cfe709f625c82439577248687dceb7c 100644 --- a/python/paddle/v2/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -16,6 +16,7 @@ All layers just related to the detection neural network. """ from layer_function_generator import generate_layer_fn +from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor import ops @@ -28,6 +29,7 @@ __all__ = [ 'target_assign', 'detection_output', 'ssd_loss', + 'detection_map', ] __auto__ = [ @@ -132,7 +134,48 @@ def detection_output(scores, return nmsed_outs -def bipartite_match(dist_matrix, name=None): +@autodoc() +def detection_map(detect_res, + label, + pos_count=None, + true_pos=None, + false_pos=None, + overlap_threshold=0.3, + evaluate_difficult=True, + ap_type='integral'): + helper = LayerHelper("detection_map", **locals()) + + map_out = helper.create_tmp_variable(dtype='float32') + accum_pos_count_out = helper.create_tmp_variable(dtype='int32') + accum_true_pos_out = helper.create_tmp_variable(dtype='float32') + accum_false_pos_out = helper.create_tmp_variable(dtype='float32') + helper.append_op( + type="detection_map", + inputs={ + 'Label': label, + 'DetectRes': detect_res, + 'PosCount': pos_count, + 'TruePos': true_pos, + 'FalsePos': false_pos + }, + outputs={ + 'MAP': map_out, + 'AccumPosCount': accum_pos_count_out, + 'AccumTruePos': accum_true_pos_out, + 'AccumFalsePos': accum_false_pos_out + }, + attrs={ + 'overlap_threshold': overlap_threshold, + 'evaluate_difficult': evaluate_difficult, + 'ap_type': ap_type + }) + return map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out + + +def bipartite_match(dist_matrix, + match_type=None, + dist_threshold=None, + name=None): """ **Bipartite matchint operator** @@ -164,6 +207,11 @@ def bipartite_match(dist_matrix, name=None): This tensor can contain LoD information to represent a batch of inputs. One instance of this batch can contain different numbers of entities. + match_type(string|None): The type of matching method, should be + 'bipartite' or 'per_prediction', 'bipartite' by default. + dist_threshold(float|None): If `match_type` is 'per_prediction', + this threshold is to determine the extra matching bboxes based + on the maximum distance, 0.5 by default. Returns: match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. N is the batch size. If match_indices[i][j] is -1, it @@ -183,6 +231,10 @@ def bipartite_match(dist_matrix, name=None): helper.append_op( type='bipartite_match', inputs={'DistMat': dist_matrix}, + attrs={ + 'match_type': match_type, + 'dist_threshold': dist_threshold, + }, outputs={ 'ColToRowMatchIndices': match_indices, 'ColToRowMatchDist': match_distance @@ -276,6 +328,7 @@ def ssd_loss(location, conf_loss_weight=1.0, match_type='per_prediction', mining_type='max_negative', + normalize=True, sample_size=None): """ **Multi-box loss layer for object dection algorithm of SSD** @@ -324,18 +377,20 @@ def ssd_loss(location, `overlap_threshold` to determine the extra matching bboxes when finding matched boxes. 0.5 by default. neg_pos_ratio (float): The ratio of the negative boxes to the positive - boxes, used only when mining_type is max_negative, 3.0 by defalut. + boxes, used only when mining_type is 'max_negative', 3.0 by default. neg_overlap (float): The negative overlap upper bound for the unmatched - predictions. Use only when mining_type is max_negative, + predictions. Use only when mining_type is 'max_negative', 0.5 by default. - sample_size (int): The max sample size of negative box, used only when - mining_type is hard_example. loc_loss_weight (float): Weight for localization loss, 1.0 by default. conf_loss_weight (float): Weight for confidence loss, 1.0 by default. match_type (str): The type of matching method during training, should - be 'bipartite' or 'per_prediction'. + be 'bipartite' or 'per_prediction', 'per_prediction' by default. mining_type (str): The hard example mining type, should be 'hard_example' or 'max_negative', now only support `max_negative`. + normalize (bool): Whether to normalize the SSD loss by the total number + of output locations, True by default. + sample_size (int): The max sample size of negative box, used only when + mining_type is 'hard_example'. Returns: Variable: The weighted sum of the localization loss and confidence loss, @@ -381,7 +436,8 @@ def ssd_loss(location, # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. iou = iou_similarity(x=gt_box, y=prior_box) # 1.2 Compute matched boundding box by bipartite matching algorithm. - matched_indices, matched_dist = bipartite_match(iou) + matched_indices, matched_dist = bipartite_match(iou, match_type, + overlap_threshold) # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices @@ -440,10 +496,15 @@ def ssd_loss(location, # 5.1 Compute confidence loss. target_label = __reshape_to_2d(target_label) target_label = tensor.cast(x=target_label, dtype='int64') + conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) target_conf_weight = __reshape_to_2d(target_conf_weight) conf_loss = conf_loss * target_conf_weight + # the target_label and target_conf_weight do not have gradients. + target_label.stop_gradient = True + target_conf_weight.stop_gradient = True + # 5.2 Compute regression loss. location = __reshape_to_2d(location) target_bbox = __reshape_to_2d(target_bbox) @@ -452,8 +513,19 @@ def ssd_loss(location, target_loc_weight = __reshape_to_2d(target_loc_weight) loc_loss = loc_loss * target_loc_weight + # the target_bbox and target_loc_weight do not have gradients. + target_bbox.stop_gradient = True + target_loc_weight.stop_gradient = True + # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss + # reshape to [N, Np], N is the batch size and Np is the prior box number.
+ loss = ops.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reduce_sum(loss, dim=1, keep_dim=True) + if normalize: + normalizer = nn.reduce_sum(target_loc_weight) + loss = loss / normalizer + return loss diff --git a/python/paddle/v2/fluid/layers/device.py b/python/paddle/fluid/layers/device.py similarity index 92% rename from python/paddle/v2/fluid/layers/device.py rename to python/paddle/fluid/layers/device.py index 3fee263ac0fc0aa794b290c35e6a929572d83e6d..e0c1aab230aeed7fb858e91e7da7eae58032ee16 100644 --- a/python/paddle/v2/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -25,7 +25,8 @@ __all__ = ['get_places'] @autodoc() def get_places(device_count=None, device_type=None): helper = LayerHelper('get_places', **locals()) - out_places = helper.create_variable(name=unique_name(helper.name + ".out")) + out_places = helper.create_variable( + name=unique_name.generate(helper.name + ".out")) attrs = dict() if device_count is not None: attrs['device_count'] = int(device_count) diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/fluid/layers/io.py similarity index 100% rename from python/paddle/v2/fluid/layers/io.py rename to python/paddle/fluid/layers/io.py diff --git a/python/paddle/v2/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py similarity index 94% rename from python/paddle/v2/fluid/layers/layer_function_generator.py rename to python/paddle/fluid/layers/layer_function_generator.py index 88c9ae31b7902a1b098700839fb533c013b03103..bd79022a0c39cf18bd05d49ac62986d342a4ae06 100644 --- a/python/paddle/v2/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -130,7 +130,7 @@ def generate_layer_fn(op_type): o_name = not_intermediate_outputs[0].name intermediate_output_names = [output.name for output in intermediate_outputs] - def infer_and_check_dtype(op_proto, **kwargs): + def infer_and_check_dtype(op_proto, *args, **kwargs): """ This function performs the sanity check for dtype and instance type. 
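# (A numpy sketch of the arithmetic only; shapes and values are invented.)
# The normalize=True branch added to ssd_loss above reduces the per-prior
# losses to one value per sample, then divides by the number of matched
# locations, i.e. the sum of target_loc_weight:
import numpy as np

N, Np = 2, 4                                  # batch size, prior box count
loss = np.random.rand(N * Np, 1)              # per-prior weighted loss
target_loc_weight = np.array(
    [[1.], [0.], [1.], [1.], [0.], [1.], [0.], [0.]])  # 1 = matched box

per_sample = loss.reshape(N, Np).sum(axis=1, keepdims=True)  # [N, 1]
normalizer = target_loc_weight.sum()          # 4.0 matched locations here
normalized_loss = per_sample / normalizer     # what ssd_loss returns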
@@ -141,6 +141,10 @@ def generate_layer_fn(op_type): val = kwargs.pop(name, []) if not isinstance(val, list) and not isinstance(val, tuple): val = [val] + if len(val) == 0: + val = [args[0]] + args = args[1:] + for each in val: if not isinstance(each, Variable): raise ValueError("input of {0} must be variable".format( @@ -155,10 +159,10 @@ def generate_layer_fn(op_type): return dtype - def func(**kwargs): + def func(*args, **kwargs): helper = LayerHelper(op_type, **kwargs) - dtype = infer_and_check_dtype(op_proto, **kwargs) + dtype = infer_and_check_dtype(op_proto, *args, **kwargs) inputs = dict() for ipt in op_proto.inputs: @@ -166,6 +170,9 @@ def generate_layer_fn(op_type): val = kwargs.pop(name, []) if not isinstance(val, list) and not isinstance(val, tuple): val = [val] + if len(val) == 0 and len(args) != 0: + val = args[0] + args = args[1:] inputs[ipt.name] = val outputs = dict() diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py similarity index 97% rename from python/paddle/v2/fluid/layers/math_op_patch.py rename to python/paddle/fluid/layers/math_op_patch.py index 417a01b76f16336d38a3f7589f660b1a7779594e..faccc3ddf827e4211c9f2e61da7138e5d43f1d11 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -21,7 +21,7 @@ __all__ = ['monkey_patch_variable'] def monkey_patch_variable(): def unique_tmp_name(): - return unique_name("tmp") + return unique_name.generate("tmp") def safe_get_dtype(var): try: @@ -157,7 +157,9 @@ def monkey_patch_variable(): ("__eq__", "equal", False), ("__ne__", "not_equal", False), ("__lt__", "less_than", False), - ("__le__", "less_equal", False)): + ("__le__", "less_equal", False), + ("__gt__", "greater_than", False), + ("__ge__", "greater_equal", False)): setattr(Variable, method_name, _elemwise_method_creator_(method_name, op_type, reverse)) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py similarity index 98% rename from python/paddle/v2/fluid/layers/nn.py rename to python/paddle/fluid/layers/nn.py index 4a47d3f425f716ea6ad215ff0e4b66330b75ce2f..76101fe64fa96fbd4d6eae15d9ac89086e7819b2 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -21,6 +21,7 @@ from ..framework import Variable from ..param_attr import ParamAttr from layer_function_generator import autodoc from tensor import concat +import utils __all__ = [ 'fc', @@ -1103,8 +1104,8 @@ def sequence_conv(input, def conv2d(input, num_filters, filter_size, - stride=None, - padding=None, + stride=1, + padding=0, groups=None, param_attr=None, bias_attr=None, @@ -1217,12 +1218,10 @@ def conv2d(input, raise ValueError("num_channels must be divisible by groups.") num_filter_channels = num_channels / groups - if isinstance(filter_size, int): - filter_size = [filter_size, filter_size] - if isinstance(stride, int): - stride = [stride, stride] - if isinstance(padding, int): - padding = [padding, padding] + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + stride = utils.convert_to_list(stride, 2, 'stride') + padding = utils.convert_to_list(padding, 2, 'padding') + if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") @@ -1397,10 +1396,10 @@ def sequence_last_step(input): def pool2d(input, - pool_size, - pool_type, - pool_stride=None, - pool_padding=None, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, global_pooling=False, use_cudnn=True, name=None): @@ 
-1408,20 +1407,20 @@ def pool2d(input, This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. """ - if pool_padding is None: - pool_padding = [0, 0] - if pool_stride is None: - pool_stride = [1, 1] if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", str(pool_type)) - if isinstance(pool_size, int): - pool_size = [pool_size, pool_size] - if isinstance(pool_stride, int): - pool_stride = [pool_stride, pool_stride] - if isinstance(pool_padding, int): - pool_padding = [pool_padding, pool_padding] + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. Received pool_size: " + str(pool_size)) + + pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') + pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding') + pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') + if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") @@ -1484,21 +1483,21 @@ def batch_norm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) - mean = helper.create_global_variable( - name=moving_mean_name, - dtype=input.dtype, + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, initializer=Constant(0.0), trainable=False), shape=param_shape, - persistable=True, - stop_gradient=True) - helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) + dtype=input.dtype) + mean.stop_gradient = True - variance = helper.create_global_variable( - name=moving_variance_name, - dtype=input.dtype, + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False), shape=param_shape, - persistable=True, - stop_gradient=True) - helper.set_variable_initializer(var=variance, initializer=Constant(1.0)) + dtype=input.dtype) + variance.stop_gradient = True # create output # mean and mean_out share the same memory @@ -1650,9 +1649,9 @@ def conv2d_transpose(input, num_filters, output_size=None, filter_size=None, - padding=None, - stride=None, - dilation=None, + padding=0, + stride=1, + dilation=1, param_attr=None, use_cudnn=True, name=None): @@ -1748,26 +1747,12 @@ def conv2d_transpose(input, raise TypeError("Input of conv2d_transpose must be Variable") input_channel = input.shape[1] - op_attr = dict() - - if isinstance(padding, int): - op_attr['paddings'] = [padding, padding] - elif padding is not None: - op_attr['paddings'] = padding - - if isinstance(stride, int): - op_attr['strides'] = [stride, stride] - elif stride is not None: - op_attr['strides'] = stride - - if isinstance(dilation, int): - op_attr['dilations'] = [dilation, dilation] - elif dilation is not None: - op_attr['dilations'] = dilation + padding = utils.convert_to_list(padding, 2, 'padding') + stride = utils.convert_to_list(stride, 2, 'stride') + dilation = utils.convert_to_list(dilation, 2, 'dilation') if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - op_attr['use_cudnn'] = use_cudnn if filter_size is None: if output_size is None: @@ -1775,10 +1760,6 @@ def conv2d_transpose(input, if isinstance(output_size, int): output_size = [output_size, output_size] - padding = op_attr.get('paddings', [0, 0]) - stride = op_attr.get('strides', [1, 1]) - dilation = op_attr.get('dilations', [1, 1]) - h_in = 
input.shape[2] w_in = input.shape[3] @@ -1787,9 +1768,9 @@ filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] - 1) / dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] - - elif isinstance(filter_size, int): - filter_size = [filter_size, filter_size] + else: + filter_size = utils.convert_to_list(filter_size, 2, + 'conv2d_transpose.filter_size') filter_shape = [input_channel, num_filters] + filter_size img_filter = helper.create_parameter( @@ -1801,7 +1782,12 @@ def conv2d_transpose(input, inputs={'Input': [input], 'Filter': [img_filter]}, outputs={'Output': out}, - attrs=op_attr) + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'use_cudnn': use_cudnn + }) return out diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py similarity index 100% rename from python/paddle/v2/fluid/layers/ops.py rename to python/paddle/fluid/layers/ops.py diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py similarity index 99% rename from python/paddle/v2/fluid/layers/tensor.py rename to python/paddle/fluid/layers/tensor.py index 97e8f082cffe2b139fb40ddbf9b27f463f47c8ab..8100e8f034fb5d6ca706d1408f448fa26193f282 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -160,8 +160,8 @@ def sums(input, out=None): a0 = layers.array_read(array=tmp, i=i) i = layers.increment(x=i) a1 = layers.array_read(array=tmp, i=i) - mean_a0 = layers.mean(x=a0) - mean_a1 = layers.mean(x=a1) + mean_a0 = layers.mean(a0) + mean_a1 = layers.mean(a1) a_sum = layers.sums(input=[mean_a0, mean_a1]) """ helper = LayerHelper('sum', **locals()) diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..49ec3088831dff415e042e1b0a632f63106eb07b --- /dev/null +++ b/python/paddle/fluid/layers/utils.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + + +def convert_to_list(value, n, name, dtype=np.int): + """ + Converts a single numerical type or iterable of numerical + types into a numerical type list. + + Arguments: + value: The value to validate and convert. Could be an int, or any iterable + of ints. + n: The size of the list to be returned. + name: The name of the argument being validated, e.g. "stride" or + "filter_size". This is only used to format error messages. + dtype: the numerical type of the elements of the list to be returned. + + Returns: + A list of n dtypes. + + Raises: + ValueError: If something other than an int/long or iterable thereof was + passed. + """ + if isinstance(value, dtype): + return [value, ] * n + else: + try: + value_list = list(value) + except TypeError: + raise ValueError("The " + name + + "'s type must be list or tuple.
Received: " + str( + value)) + if len(value_list) != n: + raise ValueError("The " + name + "'s length must be " + str(n) + + ". Received: " + str(value)) + for single_value in value_list: + try: + dtype(single_value) + except (ValueError, TypeError): + raise ValueError( + "The " + name + "'s type must be a list or tuple of " + str( + n) + " " + str(dtype) + " . Received: " + str( + value) + " " + "including element " + str(single_value) + " of type" + " " + + str(type(single_value))) + return value_list diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/fluid/learning_rate_decay.py similarity index 100% rename from python/paddle/v2/fluid/learning_rate_decay.py rename to python/paddle/fluid/learning_rate_decay.py diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py similarity index 100% rename from python/paddle/v2/fluid/memory_optimization_transpiler.py rename to python/paddle/fluid/memory_optimization_transpiler.py diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py similarity index 97% rename from python/paddle/v2/fluid/net_drawer.py rename to python/paddle/fluid/net_drawer.py index 66793a57858cc6f8c6e18ebe51e7403ddd56f242..73946a0721dc4a6d03074a4708cf574951412e66 100644 --- a/python/paddle/v2/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -17,8 +17,8 @@ import json import logging from collections import defaultdict -import paddle.v2.fluid.core as core -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 +import paddle.fluid.core as core +import paddle.fluid.proto.framework_pb2 as framework_pb2 logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/fluid/nets.py similarity index 100% rename from python/paddle/v2/fluid/nets.py rename to python/paddle/fluid/nets.py diff --git a/python/paddle/v2/fluid/op.py b/python/paddle/fluid/op.py similarity index 99% rename from python/paddle/v2/fluid/op.py rename to python/paddle/fluid/op.py index 6a413704583a0d830ab385c0f8571893edfc9288..0b76e94157e378b40baff641c466968e239d8a83 100644 --- a/python/paddle/v2/fluid/op.py +++ b/python/paddle/fluid/op.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
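# (Usage sketch for the convert_to_list helper introduced above; the calls
# are illustrative, not taken from the patch.) conv2d, pool2d and
# conv2d_transpose now route their size arguments through it, so a scalar
# and a pair are both accepted:
from paddle.fluid.layers.utils import convert_to_list

assert convert_to_list(3, 2, 'stride') == [3, 3]       # scalar is broadcast
assert convert_to_list((2, 1), 2, 'stride') == [2, 1]  # tuple becomes a list
try:
    convert_to_list([1, 2, 3], 2, 'stride')            # wrong length
except ValueError as e:
    print(e)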
-import paddle.v2.fluid.core as core -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 +import paddle.fluid.core as core +import paddle.fluid.proto.framework_pb2 as framework_pb2 def get_all_op_protos(): diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/fluid/optimizer.py similarity index 92% rename from python/paddle/v2/fluid/optimizer.py rename to python/paddle/fluid/optimizer.py index ecc42f6215bdd13f6ea4284dcd67b6026ad33129..93a19de92e1654df2424019d764f1cbbe6314686 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -17,7 +17,8 @@ from collections import defaultdict import framework import layers from backward import append_backward -from framework import unique_name, program_guard +from framework import program_guard +import unique_name from initializer import Constant from layer_helper import LayerHelper from regularizer import append_regularization_ops @@ -35,10 +36,18 @@ class Optimizer(object): """ def __init__(self, learning_rate, global_step=None, regularization=None): - assert learning_rate is not None + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, framework.Variable): + raise TypeError("learning rate should be float or Variable") self._global_step = global_step self.regularization = regularization - self._global_learning_rate = learning_rate + self._learning_rate = learning_rate + # each program should have an independent learning rate + # program -> Variable(learning_rate) + self._learning_rate_map = dict() + if isinstance(self._learning_rate, framework.Variable): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators.
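# (Behaviour sketch only; global_learning_rate() is defined in the next
# hunk.) A float learning rate now yields one learning-rate variable per
# program, while a Variable learning rate is bound to the default main
# program that was current at construction time:
import paddle.fluid as fluid

sgd = fluid.optimizer.SGD(learning_rate=0.01)
# No variable exists until minimize() triggers _create_global_learning_rate:
assert sgd.global_learning_rate(fluid.default_main_program()) is None
# fluid.optimizer.SGD(learning_rate="0.01") would now raise TypeError.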
@@ -47,26 +56,33 @@ class Optimizer(object): self.helper = None def _create_global_learning_rate(self): - if isinstance(self._global_learning_rate, float): - self._global_learning_rate = layers.create_global_var( - name=unique_name("learning_rate"), - shape=[1], - value=float(self._global_learning_rate), - dtype='float32', - persistable=True) - - if not isinstance(self._global_learning_rate, framework.Variable): - raise ValueError("learning rate should be a Variable, " - "actual type is %s", - type(self._global_learning_rate)) - - @property - def global_learning_rate(self): + lr = self.global_learning_rate() + + if isinstance(lr, framework.Variable): + return + else: + if not isinstance(self._learning_rate, float): + raise TypeError( + "learning rate variable is created outside optimizer, " + "cannot create new learning rate variable for new program") + + # create learning rate in the current main program + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32', + persistable=True) + + def global_learning_rate(self, program=None): """ get global decayed learning rate :return: """ - return self._global_learning_rate + if program is None: + program = framework.default_main_program() + return self._learning_rate_map.get(program, None) def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op @@ -77,7 +93,7 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - return self._global_learning_rate * param_lr + return self.global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -118,7 +134,7 @@ class Optimizer(object): assert isinstance(self.helper, LayerHelper) var = self.helper.create_global_variable( - name=unique_name(name), + name=unique_name.generate(name), persistable=True, dtype=dtype or param.dtype, type=param.type, @@ -379,7 +395,7 @@ class AdamOptimizer(Optimizer): # Create beta1 and beta2 power tensors beta_shape = [1] self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name('beta1_pow_acc'), + name=unique_name.generate('beta1_pow_acc'), dtype='float32', shape=beta_shape, lod_level=0, @@ -388,7 +404,7 @@ self._beta1_pow_acc, initializer=Constant(self._beta1)) self._beta2_pow_acc = self.helper.create_global_variable( - name=unique_name('beta2_pow_acc'), + name=unique_name.generate('beta2_pow_acc'), dtype='float32', shape=beta_shape, lod_level=0, @@ -481,7 +497,7 @@ class AdamaxOptimizer(Optimizer): # Create beta1 power accumulator tensor beta_shape = [1] self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name('beta1_pow_acc'), + name=unique_name.generate('beta1_pow_acc'), dtype='float32', shape=beta_shape, lod_level=0, diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/fluid/param_attr.py similarity index 100% rename from python/paddle/v2/fluid/param_attr.py rename to python/paddle/fluid/param_attr.py diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/fluid/profiler.py similarity index 93% rename from python/paddle/v2/fluid/profiler.py rename to python/paddle/fluid/profiler.py index 4611986c9969f12b71290cf8ee03a50a6ad76f94..59e75209d39dc0f2b72ecf832ff15df192a2898e 100644 ---
a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -97,9 +97,14 @@ def profiler(state, sorted_key=None): The `ave` means sorting by the average execution time. """ - if state not in ['CPU', 'GPU']: - raise ValueError("The state must be 'CPU' or 'GPU'.") - prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU + if state not in ['CPU', 'GPU', "All"]: + raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") + if state == "GPU": + prof_state = core.ProfilerState.kCUDA + elif state == "CPU": + prof_state = core.ProfilerState.kCPU + else: + prof_state = core.ProfilerState.kAll core.enable_profiler(prof_state) yield diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/fluid/regularizer.py similarity index 100% rename from python/paddle/v2/fluid/regularizer.py rename to python/paddle/fluid/regularizer.py diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/fluid/tests/.gitignore similarity index 100% rename from python/paddle/v2/fluid/tests/.gitignore rename to python/paddle/fluid/tests/.gitignore diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt similarity index 89% rename from python/paddle/v2/fluid/tests/CMakeLists.txt rename to python/paddle/fluid/tests/CMakeLists.txt index 5ff7b1b027e0e17d233f2a8a1c9775ccfbe1797e..d24417bbacb503d9ea70e68e7e0edb59e7dddbde 100644 --- a/python/paddle/v2/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -7,5 +7,4 @@ endforeach() add_subdirectory(unittests) add_subdirectory(book) -add_subdirectory(book_distribute) add_subdirectory(book_memory_optimization) diff --git a/python/paddle/v2/fluid/tests/__init__.py b/python/paddle/fluid/tests/__init__.py similarity index 100% rename from python/paddle/v2/fluid/tests/__init__.py rename to python/paddle/fluid/tests/__init__.py diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/fluid/tests/book/.gitignore similarity index 100% rename from python/paddle/v2/fluid/tests/book/.gitignore rename to python/paddle/fluid/tests/book/.gitignore diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt similarity index 100% rename from python/paddle/v2/fluid/tests/book/CMakeLists.txt rename to python/paddle/fluid/tests/book/CMakeLists.txt diff --git a/python/paddle/v2/fluid/tests/book/__init__.py b/python/paddle/fluid/tests/book/__init__.py similarity index 100% rename from python/paddle/v2/fluid/tests/book/__init__.py rename to python/paddle/fluid/tests/book/__init__.py diff --git a/python/paddle/v2/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py similarity index 83% rename from python/paddle/v2/fluid/tests/book/notest_rnn_encoder_decoer.py rename to python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py index c7db70f1b18e958d2c3d22aff1c48a4556a26fc9..983f8f4dbeac83566839de25ec9765eb248be768 100644 --- a/python/paddle/v2/fluid/tests/book/notest_rnn_encoder_decoer.py +++ b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py @@ -14,15 +14,15 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers import contextlib import math 
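# (Usage sketch; 'ave' is one of the sorted_key values mentioned in the
# profiler docstring above.) The new "All" state profiles CPU and CUDA
# activity in one pass:
import paddle.fluid.profiler as profiler

with profiler.profiler('All', sorted_key='ave'):
    pass  # run the iterations to be profiled here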
import sys import unittest -from paddle.v2.fluid.executor import Executor +from paddle.fluid.executor import Executor dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -147,7 +147,7 @@ def seq_to_seq_net(): label = fluid.layers.data( name='label_sequence', shape=[1], dtype='int64', lod_level=1) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) return avg_cost, prediction @@ -228,32 +228,34 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - word_data = create_random_lodtensor(lod, place, low=0, high=1) - trg_word = create_random_lodtensor(lod, place, low=0, high=1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == 'source_sequence' - assert feed_target_names[1] == 'target_sequence' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word_data, - feed_target_names[1]: trg_word, - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference shape: ", np_data.shape) - print("Inference results: ", np_data) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_data = create_random_lodtensor(lod, place, low=0, high=1) + trg_word = create_random_lodtensor(lod, place, low=0, high=1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == 'source_sequence' + assert feed_target_names[1] == 'target_sequence' + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word_data, + feed_target_names[1]: trg_word, + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference shape: ", np_data.shape) + print("Inference results: ", np_data) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py new file mode 100644 index 0000000000000000000000000000000000000000..93ef66851b0efd65361122853dadeefe11992ed5 --- /dev/null +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -0,0 +1,159 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2 as paddle +import paddle.fluid as fluid +import contextlib +import numpy +import unittest +import math +import sys
+import os + + +def train(use_cuda, save_dirname, is_local): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + + BATCH_SIZE = 20 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + def train_loop(main_program): + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_loss_value, = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print(avg_loss_value) + if avg_loss_value[0] < 10.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ['x'], + [y_predict], exe) + return + if math.isnan(float(avg_loss_value)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Fit a line cost is too large, {0:2.2}".format( + avg_loss_value[0])) + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators).
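# (Illustrative values only; these are the environment variables the
# is_local=False branch of train() above reads.) A single-machine smoke
# setup could look like:
import os

os.environ["PADDLE_INIT_PORT"] = "6174"
os.environ["PADDLE_INIT_PSERVERS"] = "127.0.0.1"  # comma-separated IPs
os.environ["TRAINERS"] = "1"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_INIT_TRAINER_ID"] = "0"
os.environ["TRAINING_ROLE"] = "PSERVER"           # or "TRAINER"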
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + tensor_x = numpy.random.uniform(0, 10, + [batch_size, 13]).astype("float32") + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_x}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + + +def main(use_cuda, is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "fit_a_line.inference.model" + + train(use_cuda, save_dirname, is_local) + infer(use_cuda, save_dirname) + + +class TestFitALine(unittest.TestCase): + def test_cpu(self): + with self.program_scope_guard(): + main(use_cuda=False) + + def test_cuda(self): + with self.program_scope_guard(): + main(use_cuda=True) + + @contextlib.contextmanager + def program_scope_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py similarity index 57% rename from python/paddle/v2/fluid/tests/book/test_image_classification.py rename to python/paddle/fluid/tests/book/test_image_classification.py index 734ab3e4fbabc9360f7f11013143625932585c48..613f4a7bf1c41f9f320ba8d310545a182f95e316 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -15,12 +15,13 @@ from __future__ import print_function import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import contextlib import math import sys import numpy import unittest +import os def resnet_cifar10(input, depth=32): @@ -92,7 +93,7 @@ def vgg16_bn_drop(input): return fc2 -def train(net_type, use_cuda, save_dirname): +def train(net_type, use_cuda, save_dirname, is_local): classdim = 10 data_shape = [3, 32, 32] @@ -110,14 +111,14 @@ def train(net_type, use_cuda, save_dirname): predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) # Test program test_program = fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_cost) + optimize_ops, params_grads = optimizer.minimize(avg_cost) BATCH_SIZE = 128 PASS_NUM = 1 @@ -133,38 +134,68 @@ def train(net_type, use_cuda, save_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) - exe.run(fluid.default_startup_program()) - - loss = 0.0 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - exe.run(feed=feeder.feed(data)) - - if (batch_id % 10) == 0: - acc_list = [] - avg_loss_list = [] - for tid, test_data in enumerate(test_reader()): - loss_t, acc_t = exe.run(program=test_program, - feed=feeder.feed(test_data), - fetch_list=[avg_cost, acc]) - if math.isnan(float(loss_t)): - sys.exit("got NaN loss, 
training failed.") - acc_list.append(float(acc_t)) - avg_loss_list.append(float(loss_t)) - break # Use 1 segment for speeding up CI - - acc_value = numpy.array(acc_list).mean() - avg_loss_value = numpy.array(avg_loss_list).mean() - - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. - format(pass_id, batch_id + 1, - float(avg_loss_value), float(acc_value))) - - if acc_value > 0.01: # Low threshold for speeding up CI - fluid.io.save_inference_model(save_dirname, ["pixel"], - [predict], exe) - return + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + loss = 0.0 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + exe.run(main_program, feed=feeder.feed(data)) + + if (batch_id % 10) == 0: + acc_list = [] + avg_loss_list = [] + for tid, test_data in enumerate(test_reader()): + loss_t, acc_t = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") + acc_list.append(float(acc_t)) + avg_loss_list.append(float(loss_t)) + break # Use 1 segment for speeding up CI + + acc_value = numpy.array(acc_list).mean() + avg_loss_value = numpy.array(avg_loss_list).mean() + + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_value), float(acc_value))) + + if acc_value > 0.01: # Low threshold for speeding up CI + fluid.io.save_inference_model(save_dirname, ["pixel"], + [predict], exe) + return + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -174,32 +205,36 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - # The input's dimension of conv should be 4-D or 5-D. - tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32") - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. 
- results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - print("infer results: ", results[0]) - - -def main(net_type, use_cuda): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension of conv should be 4-D or 5-D. + # Use normilized image pixels as input data, which should be in the range [0, 1.0]. + batch_size = 1 + tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32") + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("infer results: ", results[0]) + + +def main(net_type, use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model save_dirname = "image_classification_" + net_type + ".inference.model" - train(net_type, use_cuda, save_dirname) + train(net_type, use_cuda, save_dirname, is_local) infer(use_cuda, save_dirname) diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py similarity index 53% rename from python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py rename to python/paddle/fluid/tests/book/test_label_semantic_roles.py index b790246ec15777d254a0aaee90393e19f72c2ea1..13efe4efb1f2ba4ebd25868c7f3d94ca3c5fb1a1 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -17,16 +17,17 @@ import math import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 -import paddle.v2.fluid as fluid -from paddle.v2.fluid.initializer import init_on_cpu +import paddle.fluid as fluid +from paddle.fluid.initializer import init_on_cpu import contextlib import time import unittest +import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) label_dict_len = len(label_dict) -pred_len = len(verb_dict) +pred_dict_len = len(verb_dict) mark_dict_len = 2 word_dim = 32 @@ -53,7 +54,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, - size=[pred_len, word_dim], + size=[pred_dict_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr='vemb') @@ -138,7 +139,7 @@ def create_random_lodtensor(lod, place, low, high): return res -def train(use_cuda, save_dirname=None): +def train(use_cuda, save_dirname=None, is_local=True): # define network topology word = fluid.layers.data( name='word_data', shape=[1], dtype='int64', lod_level=1) @@ -164,7 +165,7 @@ def train(use_cuda, save_dirname=None): label=target, param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) - avg_cost = fluid.layers.mean(x=crf_cost) + avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) # check other optimizers and check why out will be NAN @@ -178,7 +179,7 @@ def train(use_cuda, 
save_dirname=None): decay_rate=0.5, staircase=True), global_step=global_step) - sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) # TODO(qiao) # add dependency track and move this config before optimizer @@ -204,44 +205,78 @@ def train(use_cuda, save_dirname=None): place=place) exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() - embedding_param.set( - load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) - - start_time = time.time() - batch_id = 0 - for pass_id in xrange(PASS_NUM): - chunk_evaluator.reset(exe) - for data in train_data(): - cost, precision, recall, f1_score = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + chunk_evaluator.metrics) - pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( - exe) - - if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + str( - f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) - if batch_id != 0: - print("second per batch: " + str((time.time() - start_time) - / batch_id)) - # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) - return - - batch_id = batch_id + 1 + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + + start_time = time.time() + batch_id = 0 + for pass_id in xrange(PASS_NUM): + chunk_evaluator.reset(exe) + for data in train_data(): + cost, precision, recall, f1_score = exe.run( + main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( + exe) + + if batch_id % 10 == 0: + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + # Set the threshold low to speed up the CI test + if float(pass_precision) > 0.05: + if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) + return + + batch_id = batch_id + 1 + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
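The environment-variable convention in this branch recurs in every test the PR touches: PADDLE_INIT_PSERVERS holds a comma-separated list of parameter-server IPs, PADDLE_INIT_PORT one port shared by all of them, and the loop joins the two into "ip:port" endpoints. A standalone sketch with invented addresses shows the transformation:

# pure-Python illustration; both IP addresses are invented
ips = "192.168.0.1,192.168.0.2"  # what PADDLE_INIT_PSERVERS might hold
port = "6174"                    # the PADDLE_INIT_PORT default used above
eplist = [':'.join([ip, port]) for ip in ips.split(",")]
pserver_endpoints = ",".join(eplist)
assert pserver_endpoints == "192.168.0.1:6174,192.168.0.2:6174"

The joined string is what t.transpile(..., pservers=pserver_endpoints, ...) receives in the code that follows.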
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -251,61 +286,70 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - ts_word = create_random_lodtensor(lod, place, low=0, high=1) - ts_pred = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_n2 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_n1 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_0 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_p1 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_p2 = create_random_lodtensor(lod, place, low=0, high=1) - ts_mark = create_random_lodtensor(lod, place, low=0, high=1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == 'word_data' - assert feed_target_names[1] == 'verb_data' - assert feed_target_names[2] == 'ctx_n2_data' - assert feed_target_names[3] == 'ctx_n1_data' - assert feed_target_names[4] == 'ctx_0_data' - assert feed_target_names[5] == 'ctx_p1_data' - assert feed_target_names[6] == 'ctx_p2_data' - assert feed_target_names[7] == 'mark_data' - - results = exe.run(inference_program, - feed={ - feed_target_names[0]: ts_word, - feed_target_names[1]: ts_pred, - feed_target_names[2]: ts_ctx_n2, - feed_target_names[3]: ts_ctx_n1, - feed_target_names[4]: ts_ctx_0, - feed_target_names[5]: ts_ctx_p1, - feed_target_names[6]: ts_ctx_p2, - feed_target_names[7]: ts_mark - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) - - -def main(use_cuda): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + pred = create_random_lodtensor( + lod, place, low=0, high=pred_dict_len - 1) + ctx_n2 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_n1 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_0 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_p1 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_p2 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + mark = create_random_lodtensor( + lod, place, low=0, high=mark_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == 'word_data' + assert feed_target_names[1] == 'verb_data' + assert feed_target_names[2] == 'ctx_n2_data' + assert feed_target_names[3] == 'ctx_n1_data' + assert feed_target_names[4] == 'ctx_0_data' + assert feed_target_names[5] == 'ctx_p1_data' + assert feed_target_names[6] == 'ctx_p2_data' + assert feed_target_names[7] == 'mark_data' + + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + + +def main(use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model save_dirname = "label_semantic_roles.inference.model" - train(use_cuda, save_dirname) + train(use_cuda, save_dirname, is_local) infer(use_cuda, save_dirname) diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py similarity index 78% rename from python/paddle/v2/fluid/tests/book/test_machine_translation.py rename to python/paddle/fluid/tests/book/test_machine_translation.py index d3405a9601d5c12b05e257ab2e28176fa1d743f4..caa9596a100de4f9364467690db1e80ee227c3c1 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -15,11 +15,12 @@ import contextlib import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as pd -from paddle.v2.fluid.executor import Executor +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor import unittest +import os dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -168,7 +169,7 @@ def to_lodtensor(data, place): return res -def train_main(use_cuda, is_sparse): +def train_main(use_cuda, is_sparse, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -178,10 +179,10 @@ def train_main(use_cuda, is_sparse): label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = pd.cross_entropy(input=rnn_out, label=label) - 
avg_cost = pd.mean(x=cost) + avg_cost = pd.mean(cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) - optimizer.minimize(avg_cost) + optimize_ops, params_grads = optimizer.minimize(avg_cost) train_data = paddle.batch( paddle.reader.shuffle( @@ -190,27 +191,57 @@ def train_main(use_cuda, is_sparse): exe = Executor(place) - exe.run(framework.default_startup_program()) - - batch_id = 0 - for pass_id in xrange(1): - for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - trg_word = to_lodtensor(map(lambda x: x[1], data), place) - trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) - outs = exe.run(framework.default_main_program(), - feed={ - 'src_word_id': word_data, - 'target_language_word': trg_word, - 'target_language_next_word': trg_word_next - }, - fetch_list=[avg_cost]) - avg_cost_val = np.array(outs[0]) - print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + - " avg_cost=" + str(avg_cost_val)) - if batch_id > 3: - break - batch_id += 1 + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(1): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(main_program, + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + break + batch_id += 1 + + if is_local: + train_loop(framework.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def decode_main(use_cuda, is_sparse): diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py new file mode 100644 index 0000000000000000000000000000000000000000..b57fe08e1a367c33db31c89127b6c2bc08253655 --- /dev/null +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -0,0 +1,276 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import argparse +import paddle.fluid as fluid +import paddle.v2 as paddle +import sys +import numpy +import unittest +import math +import sys +import os + +BATCH_SIZE = 64 + + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc + + +def mlp(img, label): + hidden = fluid.layers.fc(input=img, size=200, act='tanh') + hidden = fluid.layers.fc(input=hidden, size=200, act='tanh') + return loss_net(hidden, label) + + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + +def train(nn_type, + use_cuda, + parallel, + save_dirname=None, + model_filename=None, + params_filename=None, + is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if nn_type == 'mlp': + net_conf = mlp + else: + net_conf = conv_net + + if parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + img_ = pd.read_input(img) + label_ = pd.read_input(label) + prediction, avg_loss, acc = net_conf(img_, label_) + for o in [avg_loss, acc]: + pd.write_output(o) + + avg_loss, acc = pd() + # get mean loss and acc through every devices. 
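The ParallelDo block above is the multi-device path: each place receives a slice of the mini-batch, the body of pd.do() is laid down once per device, and write_output collects one tensor per device, which the following mean() calls reduce to scalars. A condensed sketch of the same pattern, with a bare fc layer standing in for the full net_conf:

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

places = fluid.layers.get_places()      # all devices visible to the process
pd = fluid.layers.ParallelDo(places)
with pd.do():
    img_ = pd.read_input(img)           # this device's slice of the batch
    label_ = pd.read_input(label)
    prediction = fluid.layers.fc(input=img_, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label_)
    pd.write_output(fluid.layers.mean(loss))  # one scalar per device

per_device_loss = pd()                  # stacked outputs, one row per device
avg_loss = fluid.layers.mean(per_device_loss)  # reduce across devices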
+ avg_loss = fluid.layers.mean(avg_loss) + acc = fluid.layers.mean(acc) + else: + prediction, avg_loss, acc = net_conf(img, label) + + test_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimize_ops, params_grads = optimizer.minimize(avg_loss) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + exe = fluid.Executor(place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + # train a mini-batch, fetch nothing + exe.run(main_program, feed=feeder.feed(data)) + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run( + program=test_program, + feed=feeder.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = numpy.array(acc_set).mean() + avg_loss_val = numpy.array(avg_loss_set).mean() + if float(acc_val + ) > 0.2: # Smaller value to increase CI speed + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + model_filename=model_filename, + params_filename=params_filename) + return + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Loss of recognize digits is too large") + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators).
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + save_dirname, exe, model_filename, params_filename) + + # The input's dimension of conv should be 4-D or 5-D. + # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. + batch_size = 1 + tensor_img = numpy.random.uniform( + -1.0, 1.0, [batch_size, 1, 28, 28]).astype("float32") + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("infer results: ", results[0]) + + +def main(use_cuda, parallel, nn_type, combine): + save_dirname = None + model_filename = None + params_filename = None + if not use_cuda and not parallel: + save_dirname = "recognize_digits_" + nn_type + ".inference.model" + if combine == True: + model_filename = "__model_combined__" + params_filename = "__params_combined__" + + # call train() with is_local argument to run distributed train + train( + nn_type=nn_type, + use_cuda=use_cuda, + parallel=parallel, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + infer( + use_cuda=use_cuda, + save_dirname=save_dirname, + model_filename=model_filename, + params_filename=params_filename) + + +class TestRecognizeDigits(unittest.TestCase): + pass + + +def inject_test_method(use_cuda, parallel, nn_type, combine): + def __impl__(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + main(use_cuda, parallel, nn_type, combine) + + fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda' + if use_cuda else 'cpu', 'parallel' + if parallel else 'normal', 'combine' + if combine else 'separate') + + setattr(TestRecognizeDigits, fn, __impl__) + + +def inject_all_tests(): + for use_cuda in (False, True): + for parallel in (False, True): + for nn_type in ('mlp', 'conv'): + inject_test_method(use_cuda, parallel, nn_type, True) + + # Two unit-test for saving parameters as separate files + inject_test_method(False, False, 'mlp', False) + inject_test_method(False, False, 'conv', False) + + +inject_all_tests() + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py similarity index 57% rename from python/paddle/v2/fluid/tests/book/test_recommender_system.py rename to python/paddle/fluid/tests/book/test_recommender_system.py index 1a7d8d57ffa7f459bac4b55882dd5f9b0ed8beda..5e258a2c5170f63aa1fbaab5f38efdba04c8d391 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -14,14 +14,15 @@ import math import sys +import os import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +import paddle.fluid.nets as nets +from paddle.fluid.executor import Executor +from paddle.fluid.optimizer import SGDOptimizer IS_SPARSE = True USE_GPU = False @@ -147,24 +148,23 @@ 
def model(): label = layers.data(name='score', shape=[1], dtype='float32') square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(x=square_cost) + avg_cost = layers.mean(square_cost) return scale_infer, avg_cost -def train(use_cuda, save_dirname): +def train(use_cuda, save_dirname, is_local=True): scale_infer, avg_cost = model() # test program test_program = fluid.default_main_program().clone() sgd_optimizer = SGDOptimizer(learning_rate=0.2) - opts = sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) - exe.run(framework.default_startup_program()) train_reader = paddle.batch( paddle.reader.shuffle( @@ -212,36 +212,69 @@ def train(use_cuda, save_dirname): feed_tensors[key] = tensor return feed_tensors - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - # train a mini-batch - outs = exe.run(program=fluid.default_main_program(), - feed=func_feed(feeding, data), - fetch_list=[avg_cost]) - out = np.array(outs[0]) - if (batch_id + 1) % 10 == 0: - avg_cost_set = [] - for test_data in test_reader(): - avg_cost_np = exe.run(program=test_program, - feed=func_feed(feeding, test_data), - fetch_list=[avg_cost]) - avg_cost_set.append(avg_cost_np[0]) - break # test only 1 segment for speeding up CI - - # get test avg_cost - test_avg_cost = np.array(avg_cost_set).mean() - if test_avg_cost < 6.0: - # if avg_cost less than 6.0, we think our code is good. - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - "user_id", "gender_id", "age_id", "job_id", - "movie_id", "category_id", "movie_title" - ], [scale_infer], exe) - return - - if math.isnan(float(out[0])): - sys.exit("got NaN loss, training failed.") + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + # train a mini-batch + outs = exe.run(program=main_program, + feed=func_feed(feeding, data), + fetch_list=[avg_cost]) + out = np.array(outs[0]) + if (batch_id + 1) % 10 == 0: + avg_cost_set = [] + for test_data in test_reader(): + avg_cost_np = exe.run( + program=test_program, + feed=func_feed(feeding, test_data), + fetch_list=[avg_cost]) + avg_cost_set.append(avg_cost_np[0]) + break # test only 1 segment for speeding up CI + + # get test avg_cost + test_avg_cost = np.array(avg_cost_set).mean() + if test_avg_cost < 6.0: + # if avg_cost less than 6.0, we think our code is good. + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + "user_id", "gender_id", "age_id", "job_id", + "movie_id", "category_id", "movie_title" + ], [scale_infer], exe) + return + + if math.isnan(float(out[0])): + sys.exit("got NaN loss, training failed.") + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -251,13 +284,6 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - def create_lod_tensor(data, lod=None): tensor = fluid.LoDTensor() if lod is None: @@ -275,44 +301,53 @@ def infer(use_cuda, save_dirname=None): tensor.set(flattened_data, place) return tensor - # Use the first data from paddle.dataset.movielens.test() as input - assert feed_target_names[0] == "user_id" - user_id = create_lod_tensor([[1]]) - - assert feed_target_names[1] == "gender_id" - gender_id = create_lod_tensor([[1]]) - - assert feed_target_names[2] == "age_id" - age_id = create_lod_tensor([[0]]) - - assert feed_target_names[3] == "job_id" - job_id = create_lod_tensor([[10]]) - - assert feed_target_names[4] == "movie_id" - movie_id = create_lod_tensor([[783]]) - - assert feed_target_names[5] == "category_id" - category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) - - assert feed_target_names[6] == "movie_title" - movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], - [[0, 5]]) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={ - feed_target_names[0]: user_id, - feed_target_names[1]: gender_id, - feed_target_names[2]: age_id, - feed_target_names[3]: job_id, - feed_target_names[4]: movie_id, - feed_target_names[5]: category_id, - feed_target_names[6]: movie_title - }, - fetch_list=fetch_targets, - return_numpy=False) - print("inferred score: ", np.array(results[0])) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
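Wrapping inference in a fresh Scope, as the two lines above do, is a change this PR applies uniformly across the book tests. A minimal sketch of what the guard buys, assuming (as these tests do) that the executor otherwise resolves variables in fluid.global_scope():

import paddle.fluid as fluid

inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
    # variables created while the guard is active (for example parameters
    # materialized by load_inference_model) live in inference_scope, not in
    # the global scope that still holds the training parameters
    pass
# once the guard exits, executors resolve variables in the global scope
# again, untouched by whatever inference loaded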
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # Use the first data from paddle.dataset.movielens.test() as input + assert feed_target_names[0] == "user_id" + user_id = create_lod_tensor([[1]]) + + assert feed_target_names[1] == "gender_id" + gender_id = create_lod_tensor([[1]]) + + assert feed_target_names[2] == "age_id" + age_id = create_lod_tensor([[0]]) + + assert feed_target_names[3] == "job_id" + job_id = create_lod_tensor([[10]]) + + assert feed_target_names[4] == "movie_id" + movie_id = create_lod_tensor([[783]]) + + assert feed_target_names[5] == "category_id" + category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) + + assert feed_target_names[6] == "movie_title" + movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], + [[0, 5]]) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={ + feed_target_names[0]: user_id, + feed_target_names[1]: gender_id, + feed_target_names[2]: age_id, + feed_target_names[3]: job_id, + feed_target_names[4]: movie_id, + feed_target_names[5]: category_id, + feed_target_names[6]: movie_title + }, + fetch_list=fetch_targets, + return_numpy=False) + print("inferred score: ", np.array(results[0])) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..1b7e84ea05cab5750865032ee7440cd5f5aa519b --- /dev/null +++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py @@ -0,0 +1,379 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
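A note on the LoD lists used throughout these inference tests: they are offset (prefix-sum) lists, not lengths. lod = [0, 4, 10] marks two sequences occupying rows 0..3 and 4..9 of a 10-row tensor, and a nested lod such as [[0, 3]] adds a level holding a single 3-item sequence. A sketch of the construction, mirroring the create_random_lodtensor helper these files define (the 0..99 id range is arbitrary):

import numpy as np
import paddle.fluid as fluid

lod = [0, 4, 10]  # offsets: a length-4 and a length-6 sequence
data = np.random.random_integers(0, 99, [lod[-1], 1]).astype("int64")
tensor = fluid.LoDTensor()
tensor.set(data, fluid.CPUPlace())  # lod[-1] rows of data in total
tensor.set_lod([lod])               # one offset list per LoD level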
+from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import paddle.v2 as paddle +import contextlib +import math +import numpy as np +import sys +import os + + +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy, prediction + + +def dyn_rnn_lstm(data, label, input_dim, class_dim=2, emb_dim=32, + lstm_size=128): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common(ipt, hidden, size): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + return gate0 + gate1 + + forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden, + lstm_size)) + + cell = forget_gate * prev_cell + input_gate * cell_gate + hidden = output_gate * fluid.layers.tanh(x=cell) + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_last_step(rnn()) + prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy, prediction + + +def stacked_lstm_net(data, + label, + input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3): + assert stacked_num % 2 == 1 + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + # add bias attr + + # TODO(qijun) linear act + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy, prediction + + +def 
create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def train(word_dict, + net_method, + use_cuda, + parallel=False, + save_dirname=None, + is_local=True): + BATCH_SIZE = 128 + PASS_NUM = 5 + dict_dim = len(word_dict) + class_dim = 2 + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + if not parallel: + cost, acc_out, prediction = net_method( + data, label, input_dim=dict_dim, class_dim=class_dim) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + cost, acc, _ = net_method( + pd.read_input(data), + pd.read_input(label), + input_dim=dict_dim, + class_dim=class_dim) + pd.write_output(cost) + pd.write_output(acc) + + cost, acc = pd() + cost = fluid.layers.mean(cost) + acc_out = fluid.layers.mean(acc) + prediction = None + assert save_dirname is None + + adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) + optimize_ops, params_grads = adagrad.minimize(cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=BATCH_SIZE) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + for pass_id in xrange(PASS_NUM): + for data in train_data(): + cost_val, acc_val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) + print("cost=" + str(cost_val) + " acc=" + str(acc_val)) + if cost_val < 0.4 and acc_val > 0.8: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ["words"], + prediction, exe) + return + if math.isnan(float(cost_val)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Cost is too large for {0}".format( + net_method.__name__)) + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(word_dict, use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict_len = len(word_dict) + + lod = [0, 4, 10] + tensor_words = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == "words" + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_words}, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) + + +def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + train( + word_dict, + net_method, + use_cuda, + parallel=parallel, + save_dirname=save_dirname) + infer(word_dict, use_cuda, save_dirname) + + +class TestUnderstandSentiment(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.word_dict = paddle.dataset.imdb.word_dict() + + @contextlib.contextmanager + def new_program_scope(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + def test_conv_cpu(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=convolution_net, + use_cuda=False, + save_dirname="understand_sentiment_conv.inference.model") + + def test_conv_cpu_parallel(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=convolution_net, + use_cuda=False, + parallel=True) + + @unittest.skip(reason="make CI faster") + def test_stacked_lstm_cpu(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=False, + save_dirname="understand_sentiment_stacked_lstm.inference.model") + + def test_stacked_lstm_cpu_parallel(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=False, + parallel=True) + + def test_conv_gpu(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=convolution_net, + use_cuda=True, + save_dirname="understand_sentiment_conv.inference.model") + +
def test_conv_gpu_parallel(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=convolution_net, + use_cuda=True, + parallel=True) + + @unittest.skip(reason="make CI faster") + def test_stacked_lstm_gpu(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=True, + save_dirname="understand_sentiment_stacked_lstm.inference.model") + + def test_stacked_lstm_gpu_parallel(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=True, + parallel=True) + + @unittest.skip(reason='make CI faster') + def test_dynrnn_lstm_gpu(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=dyn_rnn_lstm, + use_cuda=True, + parallel=False) + + def test_dynrnn_lstm_gpu_parallel(self): + with self.new_program_scope(): + main( + self.word_dict, + net_method=dyn_rnn_lstm, + use_cuda=True, + parallel=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..26b97c3e254f54b83515436660e44d4908c98fbe --- /dev/null +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
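The test classes above and the digit-recognition suite earlier share one isolation idiom. Reduced to its core, it is a context manager that points the framework at a fresh pair of Programs and a fresh Scope for each test, so parameters and default programs cannot leak between test methods; this is the new_program_scope/program_scope_guard helper from these files, extracted:

import contextlib
import paddle.fluid as fluid

@contextlib.contextmanager
def new_program_scope():
    prog = fluid.Program()
    startup_prog = fluid.Program()
    scope = fluid.core.Scope()
    with fluid.scope_guard(scope):
        with fluid.program_guard(prog, startup_prog):
            yield  # the test body builds and runs against the fresh state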
+ +import paddle.v2 as paddle +import paddle.fluid as fluid +import unittest +import os +import numpy as np +import math +import sys + + +def create_random_lodtensor(lod, place, low, high): + # The range of data elements is [low, high] + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): + PASS_NUM = 100 + EMBED_SIZE = 32 + HIDDEN_SIZE = 256 + N = 5 + BATCH_SIZE = 32 + IS_SPARSE = is_sparse + + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + + if not is_parallel: + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + else: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + avg_cost, predict_word = __network__( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, next_word + ])) + pd.write_output(avg_cost) + + avg_cost = fluid.layers.mean(pd()) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_cost_np = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + if avg_cost_np[0] < 5.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'firstw', 'secondw', 'thirdw', 'forthw' + ], [predict_word], exe) + return + if math.isnan(float(avg_cost_np[0])): + sys.exit("got NaN loss, training failed.") + + raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) + + if 
is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + # Setup inputs, by creating 4 words, the lod of which should be [0, 1] + lod = [0, 1] + first_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + second_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + third_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + fourth_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + + assert feed_target_names[0] == 'firstw' + assert feed_target_names[1] == 'secondw' + assert feed_target_names[2] == 'thirdw' + assert feed_target_names[3] == 'forthw' + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
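The four feed tensors built above each carry a single word id; the network predicts the fifth word of a 5-gram from the four context words. A hypothetical variant that feeds chosen words instead of random ids; the word_tensor helper and the example lookup are illustrative, not part of the test:

import numpy as np
import paddle.fluid as fluid

def word_tensor(word_id, place):
    # a one-word sequence: a single int64 row with lod offsets [0, 1]
    t = fluid.LoDTensor()
    t.set(np.array([[word_id]]).astype("int64"), place)
    t.set_lod([[0, 1]])
    return t

# word_dict comes from paddle.dataset.imikolov.build_dict() as above; any
# in-vocabulary word could replace 'we' in a lookup such as
# first_word = word_tensor(word_dict['we'], fluid.CPUPlace())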
+ results = exe.run(inference_program, + feed={ + feed_target_names[0]: first_word, + feed_target_names[1]: second_word, + feed_target_names[2]: third_word, + feed_target_names[3]: fourth_word + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + + +def main(use_cuda, is_sparse, is_parallel): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + if not is_parallel: + save_dirname = "word2vec.inference.model" + else: + save_dirname = None + + train(use_cuda, is_sparse, is_parallel, save_dirname) + infer(use_cuda, save_dirname) + + +FULL_TEST = os.getenv('FULL_TEST', + '0').lower() in ['true', '1', 't', 'y', 'yes', 'on'] +SKIP_REASON = "Only run the minimum number of tests on the CI server, to make CI faster" + + +class W2VTest(unittest.TestCase): + pass + + +def inject_test_method(use_cuda, is_sparse, is_parallel): + fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse" + if is_sparse else "dense", "parallel" + if is_parallel else "normal") + + def __impl__(*args, **kwargs): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + main( + use_cuda=use_cuda, + is_sparse=is_sparse, + is_parallel=is_parallel) + + if use_cuda and is_sparse: + fn = __impl__ + else: + # skip the other tests when on the CI server + fn = unittest.skipUnless( + condition=FULL_TEST, reason=SKIP_REASON)(__impl__) + + setattr(W2VTest, fn_name, fn) + + +for use_cuda in (False, True): + for is_sparse in (False, True): + for is_parallel in (False, True): + inject_test_method(use_cuda, is_sparse, is_parallel) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt b/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt similarity index 100% rename from python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt rename to python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py similarity index 97% rename from python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py rename to python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 944f8af0861f6c44f30c3c03c8ee27c406910c9b..784cfe58dfebd5451918789a3bd156b092978bd5 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -14,7 +14,7 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import math import sys @@ -30,7 +30,7 @@ y_predict = fluid.layers.fc(input=x, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) +avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py similarity index 98% rename from
python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py rename to python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index a556904107c25121a1c70ad82b113d582c41a010..57202cea1aa277631ea41a0f35ad0e308b961e3e 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -17,7 +17,7 @@ from __future__ import print_function import sys import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import math import sys @@ -117,7 +117,7 @@ else: predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) +avg_cost = fluid.layers.mean(cost) optimizer = fluid.optimizer.Adam(learning_rate=0.001) opts = optimizer.minimize(avg_cost) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py similarity index 95% rename from python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py rename to python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index 4c1eae861bbe5662f58d41c08035e6d701ac8672..689a75afc7ccdf84142f5531a438e1f9af7af4ca 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -14,11 +14,11 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor import math import sys @@ -100,7 +100,7 @@ def main(): label = layers.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) diff --git a/python/paddle/v2/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py similarity index 97% rename from python/paddle/v2/fluid/tests/demo/fc_gan.py rename to python/paddle/fluid/tests/demo/fc_gan.py index 67921db04ab3809c7f50f1f9615c10bb5dc79d07..7452ea2a34aa0c75d8e0990639b29705033af98b 100644 --- a/python/paddle/v2/fluid/tests/demo/fc_gan.py +++ b/python/paddle/fluid/tests/demo/fc_gan.py @@ -20,7 +20,7 @@ import matplotlib import numpy import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid matplotlib.use('Agg') import matplotlib.pyplot as plt @@ -96,7 +96,7 @@ def main(): x=D(img), label=fluid.layers.data( name='label', shape=[1], dtype='float32')) - d_loss = fluid.layers.mean(x=d_loss) + d_loss = fluid.layers.mean(d_loss) with fluid.program_guard(dg_program, startup_program): noise = fluid.layers.data( @@ -107,7 +107,7 @@ def main(): x=D(g_img), label=fluid.layers.fill_constant_batch_size_like( input=noise, dtype='float32', shape=[-1, 1], value=1.0)) - dg_loss 
= fluid.layers.mean(x=dg_loss) + dg_loss = fluid.layers.mean(dg_loss) opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE) diff --git a/python/paddle/v2/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py similarity index 90% rename from python/paddle/v2/fluid/tests/notest_concurrency.py rename to python/paddle/fluid/tests/notest_concurrency.py index 9d87ed9c0736413ac4f740dbc83a1ed4ad8fd444..602d5f31eb311e8f87d3f5eae78cd41d64f61141 100644 --- a/python/paddle/v2/fluid/tests/notest_concurrency.py +++ b/python/paddle/fluid/tests/notest_concurrency.py @@ -13,9 +13,9 @@ # limitations under the License. import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -from paddle.v2.fluid.executor import Executor +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.executor import Executor class TestRoutineOp(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/notest_csp.py b/python/paddle/fluid/tests/notest_csp.py similarity index 97% rename from python/paddle/v2/fluid/tests/notest_csp.py rename to python/paddle/fluid/tests/notest_csp.py index 7fe234a20b5222eb85e6bcea2fcb05c53ddd57e9..f4be833deebd2c82e060e66a8bcf590020625cf8 100644 --- a/python/paddle/v2/fluid/tests/notest_csp.py +++ b/python/paddle/fluid/tests/notest_csp.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -import paddle.v2.fluid as fluid +import paddle.fluid as fluid class TestCSPFramework(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py similarity index 98% rename from python/paddle/v2/fluid/tests/test_cpp_reader.py rename to python/paddle/fluid/tests/test_cpp_reader.py index 6d2312dbcb5ff1c9486a807dec7466da8f7317e4..b65592057817cef83bf2157c55bacea5bbe34ea1 100644 --- a/python/paddle/v2/fluid/tests/test_cpp_reader.py +++ b/python/paddle/fluid/tests/test_cpp_reader.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import numpy as np prog = fluid.framework.Program() diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py similarity index 96% rename from python/paddle/v2/fluid/tests/test_data_feeder.py rename to python/paddle/fluid/tests/test_data_feeder.py index 3154293ee6344b17ecdd628d334cf2e598078ebb..861dd3174a21d59fe12e0b794ecb2a934946ac71 100644 --- a/python/paddle/v2/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.fluid as fluid +import paddle.fluid as fluid def test_converter(): diff --git a/python/paddle/v2/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py similarity index 75% rename from python/paddle/v2/fluid/tests/test_detection.py rename to python/paddle/fluid/tests/test_detection.py index 908f4e82a62eec49ca0e88d678bb1e3003024d8e..fc25786499ff054a32e5503e796992d7f1e3ba02 100644 --- a/python/paddle/v2/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -13,9 +13,9 @@ # limitations under the License. 
from __future__ import print_function -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program, program_guard +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard import unittest @@ -145,5 +145,43 @@ class TestMultiBoxHead(unittest.TestCase): return mbox_locs, mbox_confs, box, var +class TestDetectionMAP(unittest.TestCase): + def test_detection_map(self): + program = Program() + with program_guard(program): + detect_res = layers.data( + name='detect_res', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + label = layers.data( + name='label', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + + map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out = layers.detection_map( + detect_res=detect_res, label=label) + self.assertIsNotNone(map_out) + self.assertIsNotNone(accum_pos_count_out) + self.assertIsNotNone(accum_true_pos_out) + self.assertIsNotNone(accum_false_pos_out) + self.assertEqual(map_out.shape, (1, )) + map_out, accum_pos_count_out2, accum_true_pos_out2, accum_false_pos_out2 = layers.detection_map( + detect_res=detect_res, label=label) + self.assertIsNotNone(map_out) + self.assertIsNotNone(accum_pos_count_out2) + self.assertIsNotNone(accum_true_pos_out2) + self.assertIsNotNone(accum_false_pos_out2) + self.assertEqual(map_out.shape, (1, )) + self.assertEqual(accum_pos_count_out.shape, + accum_pos_count_out2.shape) + self.assertEqual(accum_true_pos_out.shape, + accum_true_pos_out2.shape) + self.assertEqual(accum_false_pos_out.shape, + accum_false_pos_out2.shape) + print(str(program)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py similarity index 97% rename from python/paddle/v2/fluid/tests/test_error_clip.py rename to python/paddle/fluid/tests/test_error_clip.py index d577d0014dc136ee5ef92155e37009df60d9bf62..b2fd5ae29c724da52df0a5d3cb56d2ec9e5530f3 100644 --- a/python/paddle/v2/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -15,7 +15,7 @@ from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid BATCH_SIZE = 128 CLIP_MAX = 2e-6 @@ -33,7 +33,7 @@ with fluid.program_guard(main_program=prog): label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name).set_error_clip( diff --git a/python/paddle/v2/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py similarity index 97% rename from python/paddle/v2/fluid/tests/test_gradient_clip.py rename to python/paddle/fluid/tests/test_gradient_clip.py index 792262df84f5b6233548d56d7bf721b564520ba3..68b682f68b1fd147b821cfdb1e0866cf8aa04bff 100644 --- a/python/paddle/v2/fluid/tests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/test_gradient_clip.py @@ -14,7 +14,7 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid +import paddle.fluid as fluid BATCH_SIZE = 128 CLIP = 1 @@ -30,7 +30,7 @@ with fluid.program_guard(main_program=prog): label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = 
fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) prog_clip = prog.clone() diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_mnist_if_else_op.py similarity index 93% rename from python/paddle/v2/fluid/tests/test_mnist_if_else_op.py rename to python/paddle/fluid/tests/test_mnist_if_else_op.py index 75a651cf2719028fb54dc806952512c80ad3d9c4..94395f6cfb4648967558ed265e798e3505c20fc1 100644 --- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py +++ b/python/paddle/fluid/tests/test_mnist_if_else_op.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import MomentumOptimizer -import paddle.v2.fluid.core as core +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program +from paddle.fluid.executor import Executor +from paddle.fluid.optimizer import MomentumOptimizer +import paddle.fluid.core as core import paddle.v2 as paddle import unittest import numpy as np @@ -56,7 +56,7 @@ class TestMNISTIfElseOp(unittest.TestCase): prob = layers.merge_lod_tensor( in_true=true_out, in_false=false_out, mask=cond, x=image) loss = layers.cross_entropy(input=prob, label=label) - avg_loss = layers.mean(x=loss) + avg_loss = layers.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) @@ -113,7 +113,7 @@ class TestMNISTIfElseOp(unittest.TestCase): prob = ie() loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(x=loss) + avg_loss = layers.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) diff --git a/python/paddle/v2/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py similarity index 95% rename from python/paddle/v2/fluid/tests/test_python_operator_overriding.py rename to python/paddle/fluid/tests/test_python_operator_overriding.py index e5198ec17d027f007b4a831ef2e427481f8ff8c4..b5ac97eac559e8c52a8949cfd63fc8671ba52514 100644 --- a/python/paddle/v2/fluid/tests/test_python_operator_overriding.py +++ b/python/paddle/fluid/tests/test_python_operator_overriding.py @@ -16,9 +16,9 @@ import unittest import numpy as np -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.framework as framework +import paddle.fluid as fluid class TestPythonOperatorOverride(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/CMakeLists.txt rename to python/paddle/fluid/tests/unittests/CMakeLists.txt index 9355f51311e33729c0cb8ff321010235aafa4063..f96c2ca4f0593b6c2624d449304f23425c69ab93 100644 --- a/python/paddle/v2/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op) list(REMOVE_ITEM TEST_OPS test_lod_array_length_op) list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor) list(REMOVE_ITEM TEST_OPS 
test_profiler) +list(REMOVE_ITEM TEST_OPS test_nvprof) list(REMOVE_ITEM TEST_OPS test_normalization_wrapper) list(REMOVE_ITEM TEST_OPS test_executor_and_mul) list(REMOVE_ITEM TEST_OPS test_assign_value_op) @@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op) py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op) py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor) py_test_modules(test_profiler MODULES test_profiler) +py_test_modules(test_nvprof MODULES test_nvprof) py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper) py_test_modules(test_executor_and_mul MODULES test_executor_and_mul) py_test_modules(test_assign_value_op MODULES test_assign_value_op) diff --git a/python/paddle/v2/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/__init__.py rename to python/paddle/fluid/tests/unittests/__init__.py diff --git a/python/paddle/v2/fluid/tests/unittests/decorators.py b/python/paddle/fluid/tests/unittests/decorators.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/decorators.py rename to python/paddle/fluid/tests/unittests/decorators.py index 7081e4b9345b836dd1db918480f2ac86fa94c6c1..d1165e2a9199454dbcc1fda411afad20449bcc92 100644 --- a/python/paddle/v2/fluid/tests/unittests/decorators.py +++ b/python/paddle/fluid/tests/unittests/decorators.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.fluid as fluid +import paddle.fluid as fluid __all__ = ['many_times', 'prog_scope'] diff --git a/python/paddle/v2/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/op_test.py rename to python/paddle/fluid/tests/unittests/op_test.py index d8867550cae8882c2d71a641ef7e5424537e0500..f7e02595ec3b41ae7bb32353c258736968ca78d4 100644 --- a/python/paddle/v2/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -16,12 +16,12 @@ import unittest import numpy as np import random import itertools -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import collections -from paddle.v2.fluid.backward import append_backward -from paddle.v2.fluid.op import Operator -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import Program, OpProtoHolder +from paddle.fluid.backward import append_backward +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor +from paddle.fluid.framework import Program, OpProtoHolder def randomize_probability(batch_size, class_num, dtype='float32'): diff --git a/python/paddle/v2/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_accuracy_op.py rename to python/paddle/fluid/tests/unittests/test_accuracy_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_activation_op.py rename to python/paddle/fluid/tests/unittests/test_activation_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py similarity index 100% rename from 
python/paddle/v2/fluid/tests/unittests/test_adadelta_op.py rename to python/paddle/fluid/tests/unittests/test_adadelta_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_adagrad_op.py rename to python/paddle/fluid/tests/unittests/test_adagrad_op.py index 320f43023c055c0fd96d34a88997d04702265afe..2f0ea79f4d6afe91ee7e0d747f3d8f4884d8f9ee 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py @@ -14,8 +14,8 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator +import paddle.fluid.core as core +from paddle.fluid.op import Operator from op_test import OpTest import math diff --git a/python/paddle/v2/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_adam_op.py rename to python/paddle/fluid/tests/unittests/test_adam_op.py index d6c5a16ff2b5dad1d1b06be687bde1224fa99691..3c65f3d44adcebdca92f78f7834d4878a9fa3dfe 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -15,8 +15,8 @@ import unittest import numpy as np from op_test import OpTest -from paddle.v2.fluid import core -from paddle.v2.fluid.op import Operator +from paddle.fluid import core +from paddle.fluid.op import Operator class TestAdamOp1(OpTest): diff --git a/python/paddle/v2/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_adamax_op.py rename to python/paddle/fluid/tests/unittests/test_adamax_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py similarity index 86% rename from python/paddle/v2/fluid/tests/unittests/test_array_read_write_op.py rename to python/paddle/fluid/tests/unittests/test_array_read_write_op.py index 8917b9b906da6af568c28a29335c40c9737b842c..a49e9035a43e04fc1d1b2328d7562c053320b24b 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_array_read_write_op.py +++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py @@ -13,11 +13,11 @@ # limitations under the License. 
import unittest -import paddle.v2.fluid.core as core -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward -from paddle.v2.fluid.framework import default_main_program +import paddle.fluid.core as core +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward +from paddle.fluid.framework import default_main_program import numpy @@ -49,15 +49,15 @@ class TestArrayReadWrite(unittest.TestCase): i = layers.increment(x=i) a2 = layers.array_read(array=arr, i=i) - mean_a0 = layers.mean(x=a0) - mean_a1 = layers.mean(x=a1) - mean_a2 = layers.mean(x=a2) + mean_a0 = layers.mean(a0) + mean_a1 = layers.mean(a1) + mean_a2 = layers.mean(a2) a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2]) - mean_x0 = layers.mean(x=x[0]) - mean_x1 = layers.mean(x=x[1]) - mean_x2 = layers.mean(x=x[2]) + mean_x0 = layers.mean(x[0]) + mean_x1 = layers.mean(x[1]) + mean_x2 = layers.mean(x[2]) x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2]) diff --git a/python/paddle/v2/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_assign_op.py rename to python/paddle/fluid/tests/unittests/test_assign_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py similarity index 93% rename from python/paddle/v2/fluid/tests/unittests/test_assign_value_op.py rename to python/paddle/fluid/tests/unittests/test_assign_value_op.py index 99d7e958c32678f357e2dfb94de59469261c5307..02f2e6eddc80fcce4ca5a444cff82db355c085ca 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_assign_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers +import paddle.fluid as fluid +import paddle.fluid.layers as layers import op_test import numpy import unittest -import paddle.v2.fluid.framework as framework +import paddle.fluid.framework as framework class TestAssignValueOp(op_test.OpTest): diff --git a/python/paddle/v2/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_auc_op.py rename to python/paddle/fluid/tests/unittests/test_auc_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_batch_norm_op.py rename to python/paddle/fluid/tests/unittests/test_batch_norm_op.py index b7c0cb521a3b25881ccba199ac55ab8f712b1894..80e6fa6df3c21aa19feb571916f11c41ccd6bb10 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -15,9 +15,9 @@ import unittest import numpy as np from op_test import OpTest -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator -from paddle.v2.fluid.framework import grad_var_name +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import grad_var_name def get_backward_op(scope, op, no_grad_set): diff --git a/python/paddle/v2/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_beam_search_decode_op.py rename to python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 91f8f7b18bbbba120add77476d4203b61f9f57f6..4ee00605e22ba45d9e46a8bba27712c3fd97872a 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -15,8 +15,8 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_beam_search_op.py rename to python/paddle/fluid/tests/unittests/test_beam_search_op.py index 1596bb3970c870b51e69226678c9199844083613..bc708f3aff54f54d290684d68afa503a50a32dac 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -13,8 +13,8 @@ # limitations under the License. 
import logging -from paddle.v2.fluid.op import Operator, DynamicRecurrentOp -import paddle.v2.fluid.core as core +from paddle.fluid.op import Operator, DynamicRecurrentOp +import paddle.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_bilinear_tensor_product_op.py rename to python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py similarity index 68% rename from python/paddle/v2/fluid/tests/unittests/test_bipartite_match_op.py rename to python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index 9f9af2f55e2e9a1c624fb95f1c113e24c2de4a89..f7461ee6dab699064153332116449c8e20a0bac0 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -46,7 +46,20 @@ def bipartite_match(distance, match_indices, match_dist): idx += 1 -def batch_bipartite_match(distance, lod): +def argmax_match(distance, match_indices, match_dist, threshold): + r, c = distance.shape + for j in xrange(c): + if match_indices[j] != -1: + continue + col_dist = distance[:, j] + indices = np.argwhere(col_dist >= threshold).flatten() + if len(indices) < 1: + continue + match_indices[j] = indices[np.argmax(col_dist[indices])] + match_dist[j] = col_dist[match_indices[j]] + + +def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): """Bipartite Matching algorithm for batch input. Arg: distance (numpy.array) : The distance of two entries with shape [M, N]. 
@@ -59,6 +72,9 @@ def batch_bipartite_match(distance, lod): for i in range(len(lod) - 1): bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], match_dist[i, :]) + if match_type == 'per_prediction': + argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], + match_dist[i, :], dist_threshold) return match_indices, match_dist @@ -71,8 +87,8 @@ class TestBipartiteMatchOpWithLoD(OpTest): self.inputs = {'DistMat': (dist, lod)} self.outputs = { - 'ColToRowMatchIndices': (match_indices), - 'ColToRowMatchDist': (match_dist), + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, } def test_check_output(self): @@ -96,5 +112,27 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.check_output() +class TestBipartiteMatchOpWithPerPredictionType(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[0, 5, 11, 23]] + dist = np.random.random((23, 237)).astype('float32') + match_indices, match_dist = batch_bipartite_match(dist, lod[0], + 'per_prediction', 0.5) + + self.inputs = {'DistMat': (dist, lod)} + self.outputs = { + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, + } + self.attrs = { + 'match_type': 'per_prediction', + 'dist_threshold': 0.5, + } + + def test_check_output(self): + self.check_output() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_box_coder_op.py rename to python/paddle/fluid/tests/unittests/test_box_coder_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py similarity index 82% rename from python/paddle/v2/fluid/tests/unittests/test_calc_gradient.py rename to python/paddle/fluid/tests/unittests/test_calc_gradient.py index 1b38dcf343a450f3a10725def9d18366c87e72c6..06e676cd83e77549afd679e730426c590cc046bf 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -14,11 +14,11 @@ import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.backward import calc_gradient +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.framework as framework +import paddle.fluid.optimizer as optimizer +from paddle.fluid.backward import calc_gradient class TestCalcGradient(unittest.TestCase): @@ -26,7 +26,7 @@ class TestCalcGradient(unittest.TestCase): x = layers.create_parameter(dtype="float32", shape=[5, 10]) y = layers.create_parameter(dtype="float32", shape=[10, 8]) mul_out = layers.mul(x=x, y=y) - mean_out = layers.mean(x=mul_out) + mean_out = layers.mean(mul_out) a = calc_gradient(mean_out, mul_out) b = calc_gradient(mean_out, x) place = fluid.CPUPlace() diff --git a/python/paddle/v2/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_cast_op.py rename to python/paddle/fluid/tests/unittests/test_cast_op.py index 3d05a319cd4d0ab1a2237ee34a30dfa58827ff0c..8fb8d03828393ccfe57c0848d79b960c641ad39a 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -15,7 +15,7 @@ import 
op_test import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class TestCastOp(op_test.OpTest): diff --git a/python/paddle/v2/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_chunk_eval_op.py rename to python/paddle/fluid/tests/unittests/test_chunk_eval_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_clip_by_norm_op.py rename to python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_clip_op.py rename to python/paddle/fluid/tests/unittests/test_clip_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py similarity index 87% rename from python/paddle/v2/fluid/tests/unittests/test_compare_op.py rename to python/paddle/fluid/tests/unittests/test_compare_op.py index 83d57639ca4b2dab8ce62a23551161dc2766783a..405afebae85eaae6f6af0012058ad58c8bb69a2f 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -38,7 +38,10 @@ def create_test_class(op_type, typename, callback): for _type_name in {'float32', 'float64', 'int32', 'int64'}: create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_concat_op.py rename to python/paddle/fluid/tests/unittests/test_concat_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_cond_op.py b/python/paddle/fluid/tests/unittests/test_cond_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_cond_op.py rename to python/paddle/fluid/tests/unittests/test_cond_op.py index 4a1e806c4be9011dd2f4ebad1ff2abf8a4e83de3..66fbae961a2701e79da5222ae2689108335c4065 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_cond_op.py +++ b/python/paddle/fluid/tests/unittests/test_cond_op.py @@ -13,10 +13,10 @@ # limitations under the License. 
import logging -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import unittest import numpy as np -from paddle.v2.fluid.op import Operator, CondOp +from paddle.fluid.op import Operator, CondOp class PySimpleCond(object): diff --git a/python/paddle/v2/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py similarity index 84% rename from python/paddle/v2/fluid/tests/unittests/test_conditional_block.py rename to python/paddle/fluid/tests/unittests/test_conditional_block.py index 58ac26720365f5c6e6e3acb29dd9907325e76fe9..084b8d37386fac0366c190f5f30dd39467072498 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_conditional_block.py +++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py @@ -13,11 +13,11 @@ # limitations under the License. import unittest -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import default_startup_program, default_main_program -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.layers as layers +import paddle.fluid.core as core +from paddle.fluid.framework import default_startup_program, default_main_program +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward import numpy @@ -39,7 +39,7 @@ class ConditionalBlock(unittest.TestCase): outs = exe.run(feed={'X': x}, fetch_list=[out])[0] print outs - loss = layers.mean(x=out) + loss = layers.mean(out) append_backward(loss=loss) outs = exe.run( feed={'X': x}, diff --git a/python/paddle/v2/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_const_value.py rename to python/paddle/fluid/tests/unittests/test_const_value.py index 06c1c21fbcb7e1c1ee266a7c7fdd031229838da2..d1075d514e9b2b692f271f10a005815a66b421fb 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_const_value.py +++ b/python/paddle/fluid/tests/unittests/test_const_value.py @@ -13,7 +13,7 @@ # limitations under the License. 
import unittest -import paddle.v2.fluid.framework as framework +import paddle.fluid.framework as framework class ConditionalBlock(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py similarity index 92% rename from python/paddle/v2/fluid/tests/unittests/test_conv2d_op.py rename to python/paddle/fluid/tests/unittests/test_conv2d_op.py index ad242692ec6752d64030603acbf928cc50054222..1321cfd484ec8be1d8a817535386db949d825574 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest @@ -210,6 +210,19 @@ class TestWithDilation(TestConv2dOp): self.groups = 3 +class TestWithInput1x1Filter1x1(TestConv2dOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + #----------------Conv2dCUDNN---------------- class TestCUDNN(TestConv2dOp): def init_op_type(self): @@ -241,6 +254,12 @@ class TestCUDNNWith1x1(TestWith1x1): self.op_type = "conv2d" + +class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv2d" + + class TestDepthwiseConv(TestConv2dOp): def init_test_case(self): self.pad = [1, 1] @@ -265,7 +284,8 @@ class TestDepthwiseConv2(TestConv2dOp): self.op_type = "depthwise_conv2d" -# cudnn v5 does not support dilation conv. +# Please don't remove the following code. +# Currently, CI uses cuDNN v5.0, which does not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): # def init_op_type(self): # self.op_type = "conv_cudnn" diff --git a/python/paddle/v2/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_conv2d_transpose_op.py rename to python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index c9e74f58605270bb4d20f2fbdf3ed63d22fba023..d864b9b348e961c585749d47d449d775b2dfebc9 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest @@ -200,7 +200,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv2d_transpose" -# #cudnn v5 does not support dilation conv. +# Please don't remove the following code. +# Currently, CI uses cuDNN v5.0, which does not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # def init_test_case(self): # self.pad = [1, 1] diff --git a/python/paddle/v2/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py similarity index 92% rename from python/paddle/v2/fluid/tests/unittests/test_conv3d_op.py rename to python/paddle/fluid/tests/unittests/test_conv3d_op.py index 0f7e383d1aa86b54d8b82b57f44c70162bfd40b4..d5dd63e8737cbdd9b91d083fbd0b38f8baf570b3 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest @@ -200,6 +200,22 @@ class TestWith1x1(TestConv3dOp): self.groups = 3 +class TestWithInput1x1Filter1x1(TestConv3dOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.input_size = [2, 3, 1, 1, 1] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [6, f_c, 1, 1, 1] + + def init_dilation(self): + self.dilations = [1, 1, 1] + + def init_group(self): + self.groups = 3 + + class TestWithDilation(TestConv3dOp): def init_test_case(self): self.pad = [0, 0, 0] @@ -240,6 +256,12 @@ class TestWith1x1CUDNN(TestWith1x1): self.op_type = "conv3d" +class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d" + + # FIXME(typhoonzero): find a way to determine if # using cudnn > 6 in python # class TestWithDilationCUDNN(TestWithDilation): diff --git a/python/paddle/v2/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_conv3d_transpose_op.py rename to python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index a70f23d4ad4ab02fe068625fcbe11948adf148f5..55ba238710c56dd0daea388cd2dcdb79243bb71e 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest @@ -207,7 +207,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv3d_transpose" -# #cudnn v5 does not support dilation conv. +# Please don't remove the following code. +# Currently, CI uses cuDNN v5.0, which does not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # def init_test_case(self): # self.pad = [1, 1, 1] diff --git a/python/paddle/v2/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_conv_shift_op.py rename to python/paddle/fluid/tests/unittests/test_conv_shift_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_cos_sim_op.py b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_cos_sim_op.py rename to python/paddle/fluid/tests/unittests/test_cos_sim_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_create_op_doc_string.py rename to python/paddle/fluid/tests/unittests/test_create_op_doc_string.py index 4eadbd18ac428cd8d98d63fc886d16b20c90963b..5e6f9a20a93e467980f5a4f23fbcb6118317fe44 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_create_op_doc_string.py +++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -import paddle.v2.fluid.layers as layers +import paddle.fluid.layers as layers class TestDocString(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_crf_decoding_op.py rename to python/paddle/fluid/tests/unittests/test_crf_decoding_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_crop_op.py rename to python/paddle/fluid/tests/unittests/test_crop_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_cross_entropy_op.py rename to python/paddle/fluid/tests/unittests/test_cross_entropy_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_ctc_align.py rename to python/paddle/fluid/tests/unittests/test_ctc_align.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_cumsum_op.py rename to python/paddle/fluid/tests/unittests/test_cumsum_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_decayed_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_decayed_adagrad_op.py rename to python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_default_scope_funcs.py rename to python/paddle/fluid/tests/unittests/test_default_scope_funcs.py index 
d7ca59607098b3cf9865867f32d3af7804a9d7a2..a3bf7b544b91c70ffe3894219c118ec9887aba81 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_default_scope_funcs.py +++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.v2.fluid.default_scope_funcs import * +from paddle.fluid.default_scope_funcs import * import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_detection_map_op.py rename to python/paddle/fluid/tests/unittests/test_detection_map_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_detection_output_op.py b/python/paddle/fluid/tests/unittests/test_detection_output_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_detection_output_op.py rename to python/paddle/fluid/tests/unittests/test_detection_output_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_dropout_op.py rename to python/paddle/fluid/tests/unittests/test_dropout_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_dyn_rnn.py rename to python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 1571572fc6b54917ddfa6a92456180700b5024cc..df7ab0d29bdfc9410cd7dd4a8f2a7cd440ef4aba 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid as fluid +import paddle.fluid as fluid import paddle.v2 as paddle import unittest import numpy @@ -81,7 +81,7 @@ class TestDynRNN(unittest.TestCase): logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits( x=logits, label=label) - loss = fluid.layers.mean(x=loss) + loss = fluid.layers.mean(loss) sgd = fluid.optimizer.SGD(1e-4) sgd.minimize(loss=loss) cpu = fluid.CPUPlace() @@ -119,7 +119,7 @@ class TestDynRNN(unittest.TestCase): label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits( x=logits, label=label) - loss = fluid.layers.mean(x=loss) + loss = fluid.layers.mean(loss) sgd = fluid.optimizer.Adam(1e-3) sgd.minimize(loss=loss) diff --git a/python/paddle/v2/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_dynrnn_gradient_check.py rename to python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 8b01ec730aafdd864db86e2c58d68001d0f0ebfb..22329390754d8d010dced0d1aca35617140cd097 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -15,7 +15,7 @@ import numpy import random import collections -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import unittest from decorators import * @@ -272,7 +272,7 @@ class TestSimpleMul(SeedFixedTestCase): out = rnn() out = fluid.layers.sequence_pool(out, pool_type='last') - loss = fluid.layers.mean(x=out) + loss = fluid.layers.mean(out) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() @@ -348,7 +348,7 @@ class TestSimpleMulWithMemory(SeedFixedTestCase): out = rnn() last = fluid.layers.sequence_pool(input=out, pool_type='last') - loss = fluid.layers.mean(x=last) + loss = fluid.layers.mean(last) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() diff --git a/python/paddle/v2/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_dynrnn_static_input.py rename to python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index d2f05dcd14e5df4b796291339b3e0044f500f56a..b03a70f1b9e61162d37541ffeba8510fc11c605a 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -14,11 +14,11 @@ import unittest import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid as fluid -from paddle.v2.fluid.backward import append_backward -import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.framework import Program, switch_main_program +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.backward import append_backward +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, switch_main_program import bisect import numpy as np @@ -125,7 +125,7 @@ class TestDyRnnStaticInput(unittest.TestCase): return static_input_step_outs last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') - loss = fluid.layers.mean(x=last) + loss = fluid.layers.mean(last) append_backward(loss) static_input_grad = self._program.global_block().var( framework.grad_var_name('static_input_tensor')) diff --git 
a/python/paddle/v2/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_edit_distance_op.py rename to python/paddle/fluid/tests/unittests/test_edit_distance_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_add_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_add_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_div_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_div_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_max_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_max_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_min_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_min_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_mul_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_pow_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_elementwise_sub_op.py rename to python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_exception.py rename to python/paddle/fluid/tests/unittests/test_exception.py index 066b0b7409fe4a97afdbd590641269d97adf4a54..bb7c0f88f6027807394e15aa6803da2ddc22f4e2 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_exception.py +++ b/python/paddle/fluid/tests/unittests/test_exception.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid.core as core +import paddle.fluid.core as core import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py similarity index 91% rename from python/paddle/v2/fluid/tests/unittests/test_executor_and_mul.py rename to python/paddle/fluid/tests/unittests/test_executor_and_mul.py index c043c07b3a31b2675e9a76e54da216aaa26e0f48..4958bef3ef4d101f934a2776efc21efdd24a9a4d 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_executor_and_mul.py +++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py @@ -15,10 +15,10 @@ import unittest import numpy -import paddle.v2.fluid.core as core +import paddle.fluid.core as core -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.layers import mul, data +from paddle.fluid.executor import Executor +from paddle.fluid.layers import mul, data class TestExecutor(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_expand_op.py rename to python/paddle/fluid/tests/unittests/test_expand_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_feed_fetch_method.py rename to python/paddle/fluid/tests/unittests/test_feed_fetch_method.py index f24e5e27f372219673bc7a5aa75bbc880620887e..9d724a6479f061996359b1efcc5f61f0564331c7 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_feed_fetch_method.py +++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py similarity index 94% rename from python/paddle/v2/fluid/tests/unittests/test_fetch_var.py rename to python/paddle/fluid/tests/unittests/test_fetch_var.py index ed75a350b0bcb220c8435d60e1978c27da84a24c..46c3bbb6712c6276e48dd9328d7741a447f28b91 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_fetch_var.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers +import paddle.fluid as fluid +import paddle.fluid.layers as layers import op_test import numpy import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py rename to python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_fill_constant_op.py rename to python/paddle/fluid/tests/unittests/test_fill_constant_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_fill_op.py rename to python/paddle/fluid/tests/unittests/test_fill_op.py index c2e3cfe6f3c0af80bbdaa88a5a5d88531d81f752..762d29199e2127415ed7daabca63edcdbae3344f 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_fill_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np from op_test import OpTest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class TestFillOp(OpTest): diff --git a/python/paddle/v2/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_fill_zeros_like_op.py rename to python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_framework_debug_str.py b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_framework_debug_str.py rename to python/paddle/fluid/tests/unittests/test_framework_debug_str.py index 88995c24dfcefc01c85915b5ad969700799d9c04..c906c74afe66b05e2ca0e1122677e2dc738351b8 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_framework_debug_str.py +++ b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py @@ -13,7 +13,7 @@ # limitations under the License. 
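Files such as `test_fill_op.py` keep `from op_test import OpTest` unchanged because `op_test.py` lives in the same `unittests` directory and moves along with them, so only the `paddle.v2.fluid` imports need rewriting. For reference, the shared OpTest pattern these operator tests follow, sketched with an illustrative op and random data (the op choice here is an assumption, not taken from a hunk):

```python
import unittest
import numpy as np
from op_test import OpTest  # resolves locally inside tests/unittests


class TestFillZerosLikePattern(OpTest):  # illustrative use of the pattern
    def setUp(self):
        self.op_type = "fill_zeros_like"
        x = np.random.random((4, 4)).astype("float32")
        self.inputs = {'X': x}
        self.outputs = {'Out': np.zeros_like(x)}

    def test_check_output(self):
        self.check_output()  # compares op output against self.outputs


if __name__ == '__main__':
    unittest.main()
```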
import unittest -from paddle.v2.fluid.framework import Program +from paddle.fluid.framework import Program class TestDebugStringFramework(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_ftrl_op.py rename to python/paddle/fluid/tests/unittests/test_ftrl_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_gather_op.py rename to python/paddle/fluid/tests/unittests/test_gather_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py rename to python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py similarity index 92% rename from python/paddle/v2/fluid/tests/unittests/test_gaussian_random_op.py rename to python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 3c0ee64098a5a1ca7c86c0cea45995d6673c5482..272caceaf38699438ccae41691bf26b2eb4d2a22 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -15,10 +15,10 @@ import unittest import numpy -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator -from paddle.v2.fluid.executor import Executor +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor class TestGaussianRandomOp(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_get_places_op.py rename to python/paddle/fluid/tests/unittests/test_get_places_op.py index 265433e606fec3bf7a5a164e2546943382f0259b..6dab1e22f0c50ab011d6b8e8944097600cf3fecc 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_get_places_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid as fluid +import paddle.fluid as fluid import decorators import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_gru_op.py rename to python/paddle/fluid/tests/unittests/test_gru_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_gru_unit_op.py rename to python/paddle/fluid/tests/unittests/test_gru_unit_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_hinge_loss_op.py b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_hinge_loss_op.py rename to python/paddle/fluid/tests/unittests/test_hinge_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_huber_loss_op.py rename to python/paddle/fluid/tests/unittests/test_huber_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_im2sequence_op.py rename to python/paddle/fluid/tests/unittests/test_im2sequence_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_image_classification_layer.py rename to python/paddle/fluid/tests/unittests/test_image_classification_layer.py index 8af8f646a71af1c91178d7bc3a1e0b3cdec7f0bb..6ecfa9ea213fe0cf57e18fa83bbb85c223727d71 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_image_classification_layer.py +++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py @@ -14,9 +14,9 @@ import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program +import paddle.fluid as fluid +import paddle.fluid.nets as nets +from paddle.fluid.framework import Program def conv_block(input, num_filter, groups, dropouts): diff --git a/python/paddle/v2/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_infer_shape.py rename to python/paddle/fluid/tests/unittests/test_infer_shape.py index 17957b9e049183bd01089c44e5afde1bc7b7c5e3..699a2d42467b7ac0dcf1939bde744ad2fcb29c97 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_infer_shape.py +++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py @@ -14,7 +14,7 @@ import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class TestInferShape(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py similarity index 88% rename from python/paddle/v2/fluid/tests/unittests/test_inference_model_io.py rename to python/paddle/fluid/tests/unittests/test_inference_model_io.py index e381312ccc7ce529a2568ecda3182f8d619e6d0d..51460cbb1370f6794e13d18fe099865b4713691f 100644 --- 
a/python/paddle/v2/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -15,13 +15,13 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core -import paddle.v2.fluid.executor as executor -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.framework import Program, program_guard -from paddle.v2.fluid.io import save_inference_model, load_inference_model +import paddle.fluid.executor as executor +import paddle.fluid.layers as layers +import paddle.fluid.optimizer as optimizer +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.io import save_inference_model, load_inference_model class TestBook(unittest.TestCase): @@ -38,7 +38,7 @@ class TestBook(unittest.TestCase): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) diff --git a/python/paddle/v2/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_initializer.py rename to python/paddle/fluid/tests/unittests/test_initializer.py index 6d4eb62916b64ee89430c01f0caf39e1a30c5ceb..587e2025e1045f63a5825f884d4dcad8b4685e62 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -15,8 +15,8 @@ import numpy as np import unittest -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.initializer as initializer +import paddle.fluid.framework as framework +import paddle.fluid.initializer as initializer DELTA = 0.00001 diff --git a/python/paddle/v2/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_iou_similarity_op.py rename to python/paddle/fluid/tests/unittests/test_iou_similarity_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_is_empty_op.py rename to python/paddle/fluid/tests/unittests/test_is_empty_op.py index 799da9dc151f507a71b4026f78760f5a5fbb450f..4d11cf226be2ba4ffbe015198fed3191f1e02f72 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_is_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py @@ -14,8 +14,8 @@ import unittest import numpy as np -from paddle.v2.fluid.op import Operator -import paddle.v2.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid.core as core def create_tensor(scope, name, np_data): diff --git a/python/paddle/v2/fluid/tests/unittests/test_l1_norm_op.py b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_l1_norm_op.py rename to python/paddle/fluid/tests/unittests/test_l1_norm_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_label_smooth_op.py rename to 
python/paddle/fluid/tests/unittests/test_label_smooth_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_layer_norm_op.py rename to python/paddle/fluid/tests/unittests/test_layer_norm_op.py index a1206b3b858a1b03e7189bb9adbbcb3b2628f29f..8c67e45b7fc997012af5f678f21271ad8b220edc 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -16,9 +16,9 @@ import numpy as np from operator import mul from op_test import OpTest -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator -from paddle.v2.fluid.framework import grad_var_name +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import grad_var_name np.random.random(123) diff --git a/python/paddle/v2/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_layers.py rename to python/paddle/fluid/tests/unittests/test_layers.py index e757598bbacf39463c3fc0e66ecd462c2668c589..6944cca394fbc1ddde09dfeb0bc82e357a3cd225 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -15,10 +15,10 @@ from __future__ import print_function import unittest -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program, program_guard, default_main_program -from paddle.v2.fluid.param_attr import ParamAttr +import paddle.fluid.layers as layers +import paddle.fluid.nets as nets +from paddle.fluid.framework import Program, program_guard, default_main_program +from paddle.fluid.param_attr import ParamAttr import decorators @@ -30,7 +30,7 @@ class TestBook(unittest.TestCase): y_predict = layers.fc(input=x, size=1, act=None) y = layers.data(name='y', shape=[1], dtype='float32') cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) self.assertIsNotNone(avg_cost) program.append_backward(avg_cost) @@ -49,7 +49,7 @@ class TestBook(unittest.TestCase): act='softmax', param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) self.assertIsNotNone(avg_cost) print(str(program)) @@ -92,7 +92,7 @@ class TestBook(unittest.TestCase): predict = layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) program.append_backward(avg_cost) @@ -140,7 +140,7 @@ class TestBook(unittest.TestCase): size=dict_size, act='softmax') cost = layers.cross_entropy(input=predict_word, label=next_word) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) self.assertIsNotNone(avg_cost) print(str(program)) @@ -287,7 +287,7 @@ class TestBook(unittest.TestCase): num_total_classes=dict_size, param_attr='nce.w', bias_attr='nce.b') - avg_loss = layers.mean(x=loss) + avg_loss = layers.mean(loss) self.assertIsNotNone(avg_loss) print(str(default_main_program())) diff --git a/python/paddle/v2/fluid/tests/unittests/test_learning_rate_decay.py b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py similarity index 96% rename from 
python/paddle/v2/fluid/tests/unittests/test_learning_rate_decay.py rename to python/paddle/fluid/tests/unittests/test_learning_rate_decay.py index 1d6bab3d6c44b2b3403778d5db086e405bb30dee..595b05168920f3b9497172e502591cef82903cdc 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_learning_rate_decay.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py @@ -17,10 +17,10 @@ import unittest import math import copy -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.learning_rate_decay as lr_decay +import paddle.fluid.framework as framework +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.learning_rate_decay as lr_decay def exponential_decay(learning_rate, diff --git a/python/paddle/v2/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_linear_chain_crf_op.py rename to python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lod_array_length_op.py b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py similarity index 90% rename from python/paddle/v2/fluid/tests/unittests/test_lod_array_length_op.py rename to python/paddle/fluid/tests/unittests/test_lod_array_length_op.py index 643ee906d6f280903b0a8416b54c39e7678f2da2..d8b4e40662568f580ccff0257512cb8809488f17 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_lod_array_length_op.py +++ b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py @@ -13,9 +13,9 @@ # limitations under the License. import unittest -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.core as core +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core import numpy diff --git a/python/paddle/v2/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py similarity index 90% rename from python/paddle/v2/fluid/tests/unittests/test_lod_rank_table.py rename to python/paddle/fluid/tests/unittests/test_lod_rank_table.py index 70b8d69585c4fe743f5a88efffa1de04c9a2829d..093eecb8370b8ae7e4c43ce7ca6f50f5d302bd60 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
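Interleaved with the renames, this series also drops the `x=` keyword from `layers.mean`, as in the `test_inference_model_io.py` and `test_layers.py` hunks above: the input is now passed positionally. The call-site change, sketched around the same fc/square_error_cost pipeline those hunks show (data shapes here are illustrative):

```python
import paddle.fluid as fluid
import paddle.fluid.layers as layers

x = layers.data(name='x', shape=[13], dtype='float32')
y = layers.data(name='y', shape=[1], dtype='float32')
y_predict = layers.fc(input=x, size=1, act=None)
cost = layers.square_error_cost(input=y_predict, label=y)

# avg_cost = layers.mean(x=cost)   # old keyword form, removed by these hunks
avg_cost = layers.mean(cost)       # new positional form
```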
-from paddle.v2.fluid.layers import lod_rank_table, data -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.core as core +from paddle.fluid.layers import lod_rank_table, data +from paddle.fluid.executor import Executor +import paddle.fluid.core as core import numpy import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lod_reset_op.py rename to python/paddle/fluid/tests/unittests/test_lod_reset_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array.py rename to python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 0e90e25538118dbf031891061bc4dbf63a2252b3..63b17a5ccd62ed79b3d611e039c2b2705a133272 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import numpy diff --git a/python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array_ops.py rename to python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index ebc0a2f71465c0b3d10b205d3bdedfac87c500ec..66a03640c148d769787593f41a44cd4d1aaa10b1 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -13,12 +13,12 @@ # limitations under the License. 
import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import numpy -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program, program_guard -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -182,7 +182,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): array = layers.lod_tensor_to_array(x, table) result = layers.array_to_lod_tensor(array, table) - mean = layers.mean(x=result) + mean = layers.mean(result) append_backward(mean) diff --git a/python/paddle/v2/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_log_loss_op.py rename to python/paddle/fluid/tests/unittests/test_log_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_logical_op.py rename to python/paddle/fluid/tests/unittests/test_logical_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lookup_table_op.py rename to python/paddle/fluid/tests/unittests/test_lookup_table_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lrn_op.py rename to python/paddle/fluid/tests/unittests/test_lrn_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lstm_op.py rename to python/paddle/fluid/tests/unittests/test_lstm_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lstm_unit_op.py rename to python/paddle/fluid/tests/unittests/test_lstm_unit_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_lstmp_op.py rename to python/paddle/fluid/tests/unittests/test_lstmp_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_margin_rank_loss_op.py rename to python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_math_op_patch.py rename to python/paddle/fluid/tests/unittests/test_math_op_patch.py index cae5188fe88d9f8ab80e100d101b7b52e6b8b254..6864d271e795026d59525e9f1e4d86e32df980bf 100644 --- 
a/python/paddle/v2/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -14,7 +14,7 @@ import unittest import decorators -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import numpy diff --git a/python/paddle/v2/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_matmul_op.py rename to python/paddle/fluid/tests/unittests/test_matmul_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_maxout_op.py rename to python/paddle/fluid/tests/unittests/test_maxout_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_mean_op.py rename to python/paddle/fluid/tests/unittests/test_mean_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py similarity index 85% rename from python/paddle/v2/fluid/tests/unittests/test_memory_optimization_transpiler.py rename to python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index a276db581e240547a392aac9f2e68cc2ed848e01..f3dcca6b0107a9c4a6efcb0c0fd50324aaf92648 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -15,10 +15,10 @@ from __future__ import print_function import unittest -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.framework import Program, program_guard -from paddle.v2.fluid.memory_optimization_transpiler import memory_optimize +import paddle.fluid.layers as layers +import paddle.fluid.optimizer as optimizer +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.memory_optimization_transpiler import memory_optimize class TestControlFlowGraph(unittest.TestCase): @@ -29,7 +29,7 @@ class TestControlFlowGraph(unittest.TestCase): y_predict = layers.fc(input=x, size=1, act=None) y = layers.data(name='y', shape=[1], dtype='float32') cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(x=cost) + avg_cost = layers.mean(cost) opt = optimizer.SGD(learning_rate=0.001) opt = opt.minimize(avg_cost) diff --git a/python/paddle/v2/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_mine_hard_examples_op.py rename to python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_minus_op.py rename to python/paddle/fluid/tests/unittests/test_minus_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_modified_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_modified_huber_loss_op.py rename to 
python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_momentum_op.py rename to python/paddle/fluid/tests/unittests/test_momentum_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_mul_op.py rename to python/paddle/fluid/tests/unittests/test_mul_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_multiclass_nms_op.py rename to python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_multihead_attention.py b/python/paddle/fluid/tests/unittests/test_multihead_attention.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_multihead_attention.py rename to python/paddle/fluid/tests/unittests/test_multihead_attention.py index 6eeeefe021cad0447400c3930056fea3beb5142c..80c3c67967e970a7182c008b6cfd138aff044167 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_multihead_attention.py +++ b/python/paddle/fluid/tests/unittests/test_multihead_attention.py @@ -13,8 +13,8 @@ # limitations under the License. import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np diff --git a/python/paddle/v2/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_multiplex_op.py rename to python/paddle/fluid/tests/unittests/test_multiplex_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_nce.py rename to python/paddle/fluid/tests/unittests/test_nce.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_net.py b/python/paddle/fluid/tests/unittests/test_net.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_net.py rename to python/paddle/fluid/tests/unittests/test_net.py index 796a83911793ba27357f4b827960a58279c06f7f..ae1699d647d7c0adab36200fb07bde12085053c1 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_net.py +++ b/python/paddle/fluid/tests/unittests/test_net.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator +import paddle.fluid.core as core +from paddle.fluid.op import Operator import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_norm_op.py rename to python/paddle/fluid/tests/unittests/test_norm_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_normalization_wrapper.py rename to python/paddle/fluid/tests/unittests/test_normalization_wrapper.py index 094d8071e27e478ea50f0db13701f5aeb0d1f9e8..ef34893943d8f6bf91b1eb14378e463c178de84d 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_normalization_wrapper.py +++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py @@ -13,8 +13,8 @@ # limitations under the License. import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py new file mode 100644 index 0000000000000000000000000000000000000000..226e5e5d1131b1f33cfbbfefec536e6974f85b36 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nvprof.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.layers as layers +import paddle.fluid.core as core + + +class TestNVProf(unittest.TestCase): + def test_nvprof(self): + if not fluid.core.is_compiled_with_cuda(): + return + epoc = 8 + dshape = [4, 3, 28, 28] + data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + os.remove(output_file) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_one_hot_op.py rename to python/paddle/fluid/tests/unittests/test_one_hot_op.py index b7db30104a894097feb75da3a79355df93d0fa65..cd78cce8729ab2b5a0bb4817cf3022e53932283a 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -16,10 +16,10 @@ import unittest import numpy as np import math from op_test import OpTest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.framework import Program, program_guard +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, program_guard class TestOneHotOp(OpTest): diff --git a/python/paddle/v2/fluid/tests/unittests/test_op_support_gpu.py b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_op_support_gpu.py rename to python/paddle/fluid/tests/unittests/test_op_support_gpu.py index f8ac55590c72a4503c53268809ef413ddd4d400f..5fafb8280e19cca46e5bf687494c07200ca53153 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_op_support_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py @@ -13,7 +13,7 @@ # limitations under the License. 
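The new `test_nvprof.py` above carries the CUDA-profiler case that the `test_profiler.py` hunk below deletes, so the coverage moves rather than disappears. Its core is the `profiler.cuda_profiler` context manager; reduced to the profiling step alone (assuming a CUDA build and a program with a `data` feed target, exactly as in the new file):

```python
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler


def profile_iters(exe, dshape, iters=8, output_file='cuda_profiler.txt'):
    # Every exe.run() inside this block is captured by nvprof and the
    # counters are written to output_file in CSV format.
    with profiler.cuda_profiler(output_file, 'csv'):
        for _ in range(iters):
            data = np.random.random(dshape).astype('float32')
            exe.run(fluid.default_main_program(), feed={'data': data})
```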
import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class TestOpSupportGPU(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_operator.py rename to python/paddle/fluid/tests/unittests/test_operator.py index 1f5de93387d4108ef499f08f9517a80f90a1f0c2..5e418fe6ac2d62948762290a65686207d017275c 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_operator.py +++ b/python/paddle/fluid/tests/unittests/test_operator.py @@ -14,8 +14,8 @@ import unittest -import paddle.v2.fluid.op as op -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 +import paddle.fluid.op as op +import paddle.fluid.proto.framework_pb2 as framework_pb2 class TestGetAllProtos(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_operator_desc.py rename to python/paddle/fluid/tests/unittests/test_operator_desc.py index c64c08ff7f3f3f8e673b106f6b955c6109058d93..649fabe4a0cdef4c665f8a6d3ebee1bb8232185f 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -14,9 +14,9 @@ import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core -from paddle.v2.fluid.framework import Program, default_startup_program +from paddle.fluid.framework import Program, default_startup_program main_program = default_startup_program() diff --git a/python/paddle/v2/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_optimizer.py rename to python/paddle/fluid/tests/unittests/test_optimizer.py index 875e9e7c762819ea0605ac840453ef65cd87cec7..6ee7fc819a501eb7bf452b5d470a2e0df1b44600 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -14,9 +14,9 @@ import unittest -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.framework as framework +import paddle.fluid.optimizer as optimizer +from paddle.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_pad_op.py rename to python/paddle/fluid/tests/unittests/test_pad_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_parallel_op.py rename to python/paddle/fluid/tests/unittests/test_parallel_op.py index d65752608b204454d9d3e529dad366084f9b2c0e..cd20b430f93498372dd706a46c3a6d9d798721f5 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -14,7 +14,7 @@ import unittest -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import numpy @@ -127,7 +127,7 @@ class BaseParallelForTest(unittest.TestCase): data = next(generator) loss = 
generator.send(data) self.assertIsNotNone(loss) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = fluid.layers.mean(loss) fluid.backward.append_backward(loss=avg_loss) exe = fluid.Executor(place) @@ -170,7 +170,7 @@ class ParallelOpTest(BaseParallelForTest): x = fluid.layers.data(shape=[784], dtype='float32', name='img') x = yield x hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - loss = fluid.layers.mean(x=hidden) + loss = fluid.layers.mean(hidden) yield loss def test_simple_fc(self): @@ -200,7 +200,7 @@ class ParallelOpTestMultipleInput(BaseParallelForTest): hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w') hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w') - loss = fluid.layers.mean(x=hidden3) + loss = fluid.layers.mean(hidden3) yield loss def test_simple_fc(self): diff --git a/python/paddle/v2/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py similarity index 87% rename from python/paddle/v2/fluid/tests/unittests/test_parameter.py rename to python/paddle/fluid/tests/unittests/test_parameter.py index 88356a7ea14bdf4ef8cd7cb4a1a401d66556cc7e..e09865074e8aa9345fd9cc84e1f19eaf0436142f 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -13,11 +13,11 @@ # limitations under the License. import unittest -from paddle.v2.fluid.framework import default_main_program -import paddle.v2.fluid.core as core -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.io as io -from paddle.v2.fluid.initializer import ConstantInitializer +from paddle.fluid.framework import default_main_program +import paddle.fluid.core as core +from paddle.fluid.executor import Executor +import paddle.fluid.io as io +from paddle.fluid.initializer import ConstantInitializer import numpy as np main_program = default_main_program() diff --git a/python/paddle/v2/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_pool2d_op.py rename to python/paddle/fluid/tests/unittests/test_pool2d_op.py index 77961bc99f0e477ec7e38943afc59c89ffd0bfa8..12899ecca36dd1a49f536e657c1840803c3405e2 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_pool3d_op.py rename to python/paddle/fluid/tests/unittests/test_pool3d_op.py index a6afdaedc516e295ed08420d72b4873f1c4b8ee5..321b5f39ffff129384c2a809ba8f7d154dcf5036 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_pool_max_op.py rename to 
python/paddle/fluid/tests/unittests/test_pool_max_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_positive_negative_pair_op.py rename to python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_precision_recall_op.py rename to python/paddle/fluid/tests/unittests/test_precision_recall_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_prelu_op.py rename to python/paddle/fluid/tests/unittests/test_prelu_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py similarity index 87% rename from python/paddle/v2/fluid/tests/unittests/test_print_op.py rename to python/paddle/fluid/tests/unittests/test_print_op.py index 1e49ce994b788d5f0f46d8427282410b2a2d2db2..c75080fbb96d472810e5d6a1d02a77c456006f66 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -13,12 +13,12 @@ # limitations under the License. import unittest -import paddle.v2.fluid.core as core -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.backward import append_backward -from paddle.v2.fluid.framework import switch_main_program -from paddle.v2.fluid.framework import Program +import paddle.fluid.core as core +from paddle.fluid.executor import Executor +import paddle.fluid.layers as layers +from paddle.fluid.backward import append_backward +from paddle.fluid.framework import switch_main_program +from paddle.fluid.framework import Program import numpy as np @@ -35,7 +35,7 @@ class TestPrintOpCPU(unittest.TestCase): x.stop_gradient = False printed = layers.Print(input=x, **kargs) if only_forward: return printed - loss = layers.mean(x=printed) + loss = layers.mean(printed) append_backward(loss=loss) return loss diff --git a/python/paddle/v2/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_prior_box_op.py rename to python/paddle/fluid/tests/unittests/test_prior_box_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py similarity index 71% rename from python/paddle/v2/fluid/tests/unittests/test_profiler.py rename to python/paddle/fluid/tests/unittests/test_profiler.py index 62bfb2b8e24d0bf1554d19ea46f9ae7de5665937..f6f581ff7d67260dad50b285aa35276698fd7130 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -15,34 +15,16 @@ import unittest import os import numpy as np -import paddle.v2.fluid as fluid -import paddle.v2.fluid.profiler as profiler -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.layers as layers +import paddle.fluid.core as core class 
TestProfiler(unittest.TestCase): - def test_nvprof(self): - if not fluid.core.is_compiled_with_cuda(): - return - epoc = 8 - dshape = [4, 3, 28, 28] - data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') - conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - output_file = 'cuda_profiler.txt' - with profiler.cuda_profiler(output_file, 'csv') as nvprof: - for i in range(epoc): - input = np.random.random(dshape).astype('float32') - exe.run(fluid.default_main_program(), feed={'data': input}) - os.remove(output_file) - def net_profiler(self, state): - if state == 'GPU' and not core.is_compiled_with_cuda(): + enable_if_gpu = state == 'GPU' or state == "All" + if enable_if_gpu and not core.is_compiled_with_cuda(): return startup_program = fluid.Program() main_program = fluid.Program() @@ -54,7 +36,7 @@ class TestProfiler(unittest.TestCase): predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(cost) accuracy = fluid.evaluator.Accuracy(input=predict, label=label) optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) @@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase): def test_cuda_profiler(self): self.net_profiler('GPU') + def test_all_profiler(self): + self.net_profiler('All') + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_program.py rename to python/paddle/fluid/tests/unittests/test_program.py index 266e189e501fb925451a0b5bfade2dc24e3dc640..87a2195f0d5c7fd355ea01a3c8f60908b33d4b9d 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -15,8 +15,8 @@ from __future__ import print_function import unittest -from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name -import paddle.v2.fluid.layers as layers +from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name +import paddle.fluid.layers as layers main_program = default_main_program() diff --git a/python/paddle/v2/fluid/tests/unittests/test_protobuf.py b/python/paddle/fluid/tests/unittests/test_protobuf.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_protobuf.py rename to python/paddle/fluid/tests/unittests/test_protobuf.py index 90de56514da27ea8f5fe303cb9505917c4ae6485..c3f1fa80185bfc4afc3ed715d736bcba092629d8 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_protobuf.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
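Besides moving the nvprof case out, the `test_profiler.py` hunk above extends `net_profiler` to accept an `'All'` state alongside `'CPU'` and `'GPU'`, and skips both GPU-touching states on non-CUDA builds via the new `enable_if_gpu` guard. Assuming the elided body keeps driving `profiler.profiler` as a context manager (the usual `paddle.fluid.profiler` entry point; the call itself is not shown in the hunk), the state handling looks like:

```python
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler


def run_with_profiler(state, train_step, batches):
    # Mirrors the hunk's guard: 'GPU' and 'All' both require CUDA.
    if state in ('GPU', 'All') and not core.is_compiled_with_cuda():
        return
    with profiler.profiler(state, 'total'):  # assumed call, elided in hunk
        for batch in batches:
            train_step(batch)
```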
-import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 +import paddle.fluid.proto.framework_pb2 as framework_pb2 import unittest diff --git a/python/paddle/v2/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_protobuf_descs.py rename to python/paddle/fluid/tests/unittests/test_protobuf_descs.py index c3bef9587485445504bdd04247766afe8996d7da..309ea2b9b7ede442da3ac897ce8d1a4b9aa68233 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class TestOpDesc(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_proximal_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_proximal_adagrad_op.py rename to python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_proximal_gd_op.py b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_proximal_gd_op.py rename to python/paddle/fluid/tests/unittests/test_proximal_gd_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_rank_loss_op.py rename to python/paddle/fluid/tests/unittests/test_rank_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_recurrent_op.py rename to python/paddle/fluid/tests/unittests/test_recurrent_op.py index 177d8fc65f415fff542829152e0d3768c03c18fb..d6ff18430e319e236f03d5661381e923cc956590 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -14,12 +14,12 @@ import unittest -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program, grad_var_name -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, grad_var_name +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class PyRNNBase(object): @@ -127,7 +127,7 @@ class RecurrentOpTest1(unittest.TestCase): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) - self.output = layers.mean(x=self.create_rnn_op(), **self.p_info) + self.output = layers.mean(self.create_rnn_op(), **self.p_info) def create_rnn_op(self): x = layers.data( @@ -261,7 +261,7 @@ class RecurrentOpTest2(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) - self.output = layers.mean(x=self.create_rnn_op(), **self.p_info) + self.output = layers.mean(self.create_rnn_op(), **self.p_info) def create_rnn_op(self): x = 
layers.data( @@ -360,7 +360,7 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1): self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3( self.input_shape, self.output_shape) - self.output = layers.mean(x=self.create_rnn_op(), **self.p_info) + self.output = layers.mean(self.create_rnn_op(), **self.p_info) def create_rnn_op(self): x = layers.data( @@ -444,7 +444,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape, self.output_shape) - self.output = layers.mean(x=self.create_rnn_op(), **self.p_info) + self.output = layers.mean(self.create_rnn_op(), **self.p_info) print self.main_program def create_rnn_op(self): diff --git a/python/paddle/v2/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_recv_op.py rename to python/paddle/fluid/tests/unittests/test_recv_op.py index 7a0802afc546279cd5c164809f3eb44c5265b620..985d892c568472614c5f3e6691f54807ddccc4bd 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -14,8 +14,8 @@ import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers +import paddle.fluid as fluid +import paddle.fluid.layers as layers import numpy from multiprocessing import Process import os, sys diff --git a/python/paddle/v2/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_reduce_op.py rename to python/paddle/fluid/tests/unittests/test_reduce_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py similarity index 93% rename from python/paddle/v2/fluid/tests/unittests/test_registry.py rename to python/paddle/fluid/tests/unittests/test_registry.py index 82527a6ec7ca59ca0a9e10d416e1c98d9c84da09..a361c4624e3e2efa817e8137ff31133997a0a1fb 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_registry.py +++ b/python/paddle/fluid/tests/unittests/test_registry.py @@ -13,7 +13,7 @@ # limitations under the License. 
import unittest -import paddle.v2.fluid as fluid +import paddle.fluid as fluid import numpy as np import decorators @@ -22,7 +22,7 @@ class TestRegistry(unittest.TestCase): @decorators.prog_scope() def test_registry_layer(self): x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32') - output = fluid.layers.mean(x=x) + output = fluid.layers.mean(x) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/v2/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_regularizer.py rename to python/paddle/fluid/tests/unittests/test_regularizer.py index 8fc4db1c5a8d2dff91b7990d4bd5978aa64efe04..9b1c4ceada52322b3f1fdc4ab2e90a2c089ee67e 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -14,10 +14,10 @@ import unittest -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.optimizer as optimizer -import paddle.v2.fluid.regularizer as regularizer -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.framework as framework +import paddle.fluid.optimizer as optimizer +import paddle.fluid.regularizer as regularizer +from paddle.fluid.backward import append_backward class TestL2DecayRegularizer(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_reorder_lod_tensor.py rename to python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index d4e17d1b1e5a9ced0ca463101b2d41791d53e337..76d0d2f2fe80e409dc1b7fa858d43fbc6ad960ef 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -13,8 +13,8 @@ # limitations under the License. 
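The `test_registry.py` hunk above exercises the new `mean` spelling end to end: a data layer, a positional `fluid.layers.mean(x)`, and a CPU run through `Executor`. Filled out into a standalone script (the feed values are illustrative):

```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
output = fluid.layers.mean(x)  # positional argument, per this series

place = fluid.CPUPlace()
exe = fluid.Executor(place)

X = np.random.random((10, 10)).astype("float32")
mean_out = exe.run(fluid.default_main_program(),
                   feed={'X': X},
                   fetch_list=[output])[0]
print(np.allclose(np.mean(X), mean_out))  # expected: True
```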
import unittest -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.core as core import numpy diff --git a/python/paddle/v2/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_reshape_op.py rename to python/paddle/fluid/tests/unittests/test_reshape_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_rmsprop_op.py rename to python/paddle/fluid/tests/unittests/test_rmsprop_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_rnn_memory_helper_op.py b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py similarity index 96% rename from python/paddle/v2/fluid/tests/unittests/test_rnn_memory_helper_op.py rename to python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py index 773bd17456a2375e577adcd20addda9b117cae1c..178606f05961263df5ef0398064a1fd135fbe784 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_rnn_memory_helper_op.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py @@ -14,11 +14,11 @@ import unittest -from paddle.v2.fluid.framework import Program -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward +from paddle.fluid.framework import Program +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward import numpy as np -import paddle.v2.fluid.core as core +import paddle.fluid.core as core class RNNMemoryHelperOpTest(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_roi_pool_op.py rename to python/paddle/fluid/tests/unittests/test_roi_pool_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_row_conv_op.py rename to python/paddle/fluid/tests/unittests/test_row_conv_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_scale_op.py rename to python/paddle/fluid/tests/unittests/test_scale_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_scatter_op.py rename to python/paddle/fluid/tests/unittests/test_scatter_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py similarity index 89% rename from python/paddle/v2/fluid/tests/unittests/test_scope.py rename to python/paddle/fluid/tests/unittests/test_scope.py index 2a2efbf098268ee8df959e8b58eefaa6728820f9..d249a989a9499d01f6ed10d6cdbc6c456a7262c5 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_scope.py +++ b/python/paddle/fluid/tests/unittests/test_scope.py @@ -12,25 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.v2.fluid.core +import paddle.fluid.core import unittest class TestScope(unittest.TestCase): def test_create_destroy(self): - paddle_c = paddle.v2.fluid.core + paddle_c = paddle.fluid.core scope = paddle_c.Scope() self.assertIsNotNone(scope) scope_with_parent = scope.new_scope() self.assertIsNotNone(scope_with_parent) def test_none_variable(self): - paddle_c = paddle.v2.fluid.core + paddle_c = paddle.fluid.core scope = paddle_c.Scope() self.assertIsNone(scope.find_var("test")) def test_create_var_get_var(self): - paddle_c = paddle.v2.fluid.core + paddle_c = paddle.fluid.core scope = paddle_c.Scope() var_a = scope.var("var_a") self.assertIsNotNone(var_a) @@ -39,7 +39,7 @@ class TestScope(unittest.TestCase): self.assertIsNotNone(scope2.find_var('var_a')) def test_var_get_int(self): - paddle_c = paddle.v2.fluid.core + paddle_c = paddle.fluid.core scope = paddle_c.Scope() var = scope.var("test_int") var.set_int(10) diff --git a/python/paddle/v2/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_selected_rows.py rename to python/paddle/fluid/tests/unittests/test_selected_rows.py index 50c8bb4bca2edfa362e55e404a764485fba675ad..3d7b86787fbf0a855bcd86b8a873c9134cb1d5cc 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import unittest import numpy as np diff --git a/python/paddle/v2/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_seq_concat_op.py rename to python/paddle/fluid/tests/unittests/test_seq_concat_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_seq_conv.py rename to python/paddle/fluid/tests/unittests/test_seq_conv.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_seq_pool.py rename to python/paddle/fluid/tests/unittests/test_seq_pool.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sequence_erase_op.py rename to python/paddle/fluid/tests/unittests/test_sequence_erase_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sequence_expand.py rename to python/paddle/fluid/tests/unittests/test_sequence_expand.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sequence_reshape.py rename to python/paddle/fluid/tests/unittests/test_sequence_reshape.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sequence_slice_op.py 
b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sequence_slice_op.py rename to python/paddle/fluid/tests/unittests/test_sequence_slice_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sequence_softmax_op.py rename to python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py similarity index 97% rename from python/paddle/v2/fluid/tests/unittests/test_sgd_op.py rename to python/paddle/fluid/tests/unittests/test_sgd_op.py index e5379b961f40d00c9c8f6522addac3a4768ad763..c498b23db12cd83304f4c3a3d1f15bd68ad4f0b6 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -14,8 +14,8 @@ import unittest import numpy as np -import paddle.v2.fluid.core as core -from paddle.v2.fluid.op import Operator +import paddle.fluid.core as core +from paddle.fluid.op import Operator from op_test import OpTest diff --git a/python/paddle/v2/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py similarity index 91% rename from python/paddle/v2/fluid/tests/unittests/test_shrink_rnn_memory.py rename to python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index 48874ba8a5ba827a5ef36bff5adcd4a696f8cbc3..1d93230e7b74c5b6c00bbe125e3ae2d3a649b4b9 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -13,12 +13,12 @@ # limitations under the License. 
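The `test_scope` changes earlier in this section double as a compact reference for the `core.Scope` bindings: `var` creates a variable in a scope, `find_var` only looks one up (falling back to parent scopes), and `new_scope` nests a child. Condensed from what that test exercises:

    import paddle.fluid.core as core

    scope = core.Scope()
    child = scope.new_scope()                    # child scope; lookups fall back to the parent
    scope.var('var_a')                           # create 'var_a' in the parent scope
    assert child.find_var('var_a') is not None   # visible from the child
    assert scope.find_var('missing') is None     # find_var never creates

    var = scope.var('test_int')
    var.set_int(10)                              # typed accessors live on the variable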
import unittest -import paddle.v2.fluid.core as core -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.backward import append_backward -from paddle.v2.fluid.framework import default_main_program, switch_main_program -from paddle.v2.fluid.framework import Program +import paddle.fluid.core as core +from paddle.fluid.executor import Executor +import paddle.fluid.layers as layers +from paddle.fluid.backward import append_backward +from paddle.fluid.framework import default_main_program, switch_main_program +from paddle.fluid.framework import Program import numpy as np @@ -39,7 +39,7 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): i = layers.increment(x=i) i.stop_gradient = True self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) - mem3_mean = layers.mean(x=self.mem3) + mem3_mean = layers.mean(self.mem3) append_backward(loss=mem3_mean) self.x_grad = self.main_program.global_block().var('x@GRAD') diff --git a/python/paddle/v2/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py rename to python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sign_op.py rename to python/paddle/fluid/tests/unittests/test_sign_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_smooth_l1_loss_op.py rename to python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_softmax_op.py rename to python/paddle/fluid/tests/unittests/test_softmax_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py rename to python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py rename to python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 48e6756a8682a4cb4a151769bf8e9db22be80cf5..02cc7da84918041c33bf5c8def46025bc87a2b9e 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -13,12 +13,12 @@ # limitations under the License. 
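The shrink-RNN-memory test above ends with the scalar-loss pattern that recurs across these unit tests: reduce with `layers.mean`, call `append_backward`, then pull the autogenerated `@GRAD` variable out of the block. In isolation (same fluid build assumed):

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers
    from paddle.fluid.backward import append_backward

    x = layers.data(name='x', shape=[10], dtype='float32')
    x.stop_gradient = False          # data layers default to not propagating gradient
    loss = layers.mean(x)            # scalar loss, new positional spelling
    append_backward(loss)            # appends the backward ops to the main program

    # Gradients are materialized as '<name>@GRAD' variables in the block.
    x_grad = fluid.default_main_program().global_block().var('x@GRAD')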
import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import numpy as np -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Program, program_guard -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -145,7 +145,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): input=x, mask=y, level=level) out = layers.merge_lod_tensor( in_true=out_true, in_false=out_false, mask=y, x=x, level=level) - mean = layers.mean(x=out) + mean = layers.mean(out) append_backward(mean) diff --git a/python/paddle/v2/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_split_op.py rename to python/paddle/fluid/tests/unittests/test_split_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py similarity index 98% rename from python/paddle/v2/fluid/tests/unittests/test_split_selected_rows_op.py rename to python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index 2aaa05dcacf09fb2b9119b90598373bf3f874b71..286d305a777a4683d42a4d3d2d5d5f0c5b6ac12a 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -13,9 +13,9 @@ # limitations under the License. import unittest -import paddle.v2.fluid.core as core +import paddle.fluid.core as core import numpy as np -from paddle.v2.fluid.op import Operator +from paddle.fluid.op import Operator class TestSpliteSelectedRows(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_split_var.py similarity index 92% rename from python/paddle/v2/fluid/tests/unittests/test_split_var.py rename to python/paddle/fluid/tests/unittests/test_split_var.py index d7160b78b95c862a41a4a07a770ccb35c97845ca..104ceb4fe7beb70b9016f57cef0ef895a3eb8ba6 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_split_var.py +++ b/python/paddle/fluid/tests/unittests/test_split_var.py @@ -14,9 +14,9 @@ import math import unittest -from paddle.v2.fluid.distribute_transpiler import split_dense_variable -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core +from paddle.fluid.distribute_transpiler import split_dense_variable +import paddle.fluid as fluid +import paddle.fluid.core as core import random diff --git a/python/paddle/v2/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_spp_op.py rename to python/paddle/fluid/tests/unittests/test_spp_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_squared_l2_distance_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_squared_l2_distance_op.py rename to python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py 
similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_squared_l2_norm_op.py rename to python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_sum_op.py rename to python/paddle/fluid/tests/unittests/test_sum_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py similarity index 90% rename from python/paddle/v2/fluid/tests/unittests/test_switch.py rename to python/paddle/fluid/tests/unittests/test_switch.py index 11296bc04ea29366dfa7b047154063333cbe998b..528c5cce4bc7262ade196f6a81a57a57089117ec 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_switch.py +++ b/python/paddle/fluid/tests/unittests/test_switch.py @@ -14,11 +14,11 @@ import unittest -import paddle.v2.fluid.core as core -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import default_startup_program +import paddle.fluid.core as core +import paddle.fluid.layers as layers +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from paddle.fluid.framework import default_startup_program class TestSwitch(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_target_assign_op.py rename to python/paddle/fluid/tests/unittests/test_target_assign_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py similarity index 99% rename from python/paddle/v2/fluid/tests/unittests/test_tensor.py rename to python/paddle/fluid/tests/unittests/test_tensor.py index 8fe234a90f27f1bd72302ebad9d5959c917924af..a369783245ae2e35a9743ef1f4321ac919e58283 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
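The split/merge LoD-tensor test above pairs `split_lod_tensor` with `merge_lod_tensor` so that the merged result can be reduced and differentiated end to end. The round trip, condensed from that test (shapes, dtypes and `level` are assumptions carried over from it):

    import paddle.fluid.layers as layers
    from paddle.fluid.backward import append_backward

    x = layers.data(name='x', shape=[1], dtype='float32')
    y = layers.data(name='y', shape=[1], dtype='bool')   # mask: which rows take the 'true' branch
    level = 0

    out_true, out_false = layers.split_lod_tensor(input=x, mask=y, level=level)
    out = layers.merge_lod_tensor(
        in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
    mean = layers.mean(out)          # scalar target so the graph can be differentiated
    append_backward(mean)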
-import paddle.v2.fluid.core as core +import paddle.fluid.core as core import unittest import numpy diff --git a/python/paddle/v2/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_top_k_op.py rename to python/paddle/fluid/tests/unittests/test_top_k_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_transpose_op.py rename to python/paddle/fluid/tests/unittests/test_transpose_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py rename to python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_uniform_random_op.py rename to python/paddle/fluid/tests/unittests/test_uniform_random_op.py diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py new file mode 100644 index 0000000000000000000000000000000000000000..49ef335618ca7ca1e8249a61a97ca552dabdb9e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unique_name.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid + + +class TestUniqueName(unittest.TestCase): + def test_guard(self): + with fluid.unique_name.guard(): + name_1 = fluid.unique_name.generate('') + + with fluid.unique_name.guard(): + name_2 = fluid.unique_name.generate('') + + self.assertEqual(name_1, name_2) + + with fluid.unique_name.guard("A"): + name_1 = fluid.unique_name.generate('') + + with fluid.unique_name.guard('B'): + name_2 = fluid.unique_name.generate('') + + self.assertNotEqual(name_1, name_2) + + def test_generate(self): + with fluid.unique_name.guard(): + name1 = fluid.unique_name.generate('fc') + name2 = fluid.unique_name.generate('fc') + name3 = fluid.unique_name.generate('tmp') + self.assertNotEqual(name1, name2) + self.assertEqual(name1[-2:], name3[-2:]) diff --git a/python/paddle/v2/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_unpool_op.py rename to python/paddle/fluid/tests/unittests/test_unpool_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py similarity index 94% rename from python/paddle/v2/fluid/tests/unittests/test_variable.py rename to python/paddle/fluid/tests/unittests/test_variable.py index 4ae3909d2743874fb7cd4b739c3fe5db02fa02b3..49784e21c461bacadd404bf4a8640ebc4dcb26ca 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -13,8 +13,8 @@ # limitations under the License. import unittest -from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ -import paddle.v2.fluid.core as core +from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ +import paddle.fluid.core as core import numpy as np diff --git a/python/paddle/v2/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py similarity index 100% rename from python/paddle/v2/fluid/tests/unittests/test_warpctc_op.py rename to python/paddle/fluid/tests/unittests/test_warpctc_op.py diff --git a/python/paddle/v2/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py similarity index 95% rename from python/paddle/v2/fluid/tests/unittests/test_weight_normalization.py rename to python/paddle/fluid/tests/unittests/test_weight_normalization.py index c2b81dddb0039aba7b39518b43d29a2b466b85e3..2adf917bc5d3bb35842a817c57a983627b759f22 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -15,10 +15,10 @@ import unittest import numpy import collections -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -from paddle.v2.fluid.initializer import ConstantInitializer -from paddle.v2.fluid.param_attr import WeightNormParamAttr +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.initializer import ConstantInitializer +from paddle.fluid.param_attr import WeightNormParamAttr class TestWeightNormalization(unittest.TestCase): diff --git a/python/paddle/v2/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py similarity index 92% rename from python/paddle/v2/fluid/tests/unittests/test_while_op.py rename to python/paddle/fluid/tests/unittests/test_while_op.py index 
3fa1d5e0edd4ac8b62c1e45089a9384952f0822a..fe8808bc044684c96fb3382836be32dac1d241f3 100644 --- a/python/paddle/v2/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -13,10 +13,10 @@ # limitations under the License. import unittest -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -import paddle.v2.fluid.core as core -from paddle.v2.fluid.backward import append_backward +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core +from paddle.fluid.backward import append_backward import numpy @@ -58,7 +58,7 @@ class TestWhileOp(unittest.TestCase): layers.less_than(x=i, y=array_len, cond=cond) sum_result = layers.array_read(array=mem_array, i=i) - loss = layers.mean(x=sum_result) + loss = layers.mean(sum_result) append_backward(loss) diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py new file mode 100644 index 0000000000000000000000000000000000000000..33c53113ae7e8ed9aeada31f2aed6990b6fea110 --- /dev/null +++ b/python/paddle/fluid/unique_name.py @@ -0,0 +1,74 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import contextlib +import sys + +__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator'] + + +class UniqueNameGenerator(object): + """ + Generate a unique name with a given prefix. + + Args: + prefix(str): The generated name prefix. All generated names will + start with this prefix. + """ + + def __init__(self, prefix=None): + self.ids = collections.defaultdict(int) + if prefix is None: + prefix = "" + self.prefix = prefix + + def __call__(self, key): + """ + Generate a unique name for the given key. + + Args: + key(str): The key from which the returned name is built.
+ + Returns(str): A unique name composed of the prefix, the key and a per-key counter. + """ + tmp = self.ids[key] + self.ids[key] += 1 + return self.prefix + "_".join([key, str(tmp)]) + + +generator = UniqueNameGenerator() + + +def generate(key): + return generator(key) + + +def switch(new_generator=None): + global generator + old = generator + if new_generator is None: + generator = UniqueNameGenerator() + else: + generator = new_generator + return old + + +@contextlib.contextmanager +def guard(new_generator=None): + if isinstance(new_generator, basestring): + new_generator = UniqueNameGenerator(new_generator) + old = switch(new_generator) + yield + switch(old) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 23f5a24a1cea7f665fb65e802e1a7811df78208d..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -41,6 +41,26 @@ EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' UNK_IDX = 0 +def load_label_dict(filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + def load_dict(filename): d = dict() with open(filename, 'r') as f: @@ -188,7 +208,7 @@ def get_dict(): verb_dict = load_dict( paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) - label_dict = load_dict( + label_dict = load_label_dict( paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) return word_dict, verb_dict, label_dict diff --git a/python/paddle/v2/fluid/.gitignore b/python/paddle/v2/fluid/.gitignore deleted file mode 100644 index 2ff540d5764b76cf7bac64fc2bb9df6e9c1b398a..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/.gitignore +++ /dev/null @@ -1 +0,0 @@ -proto diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py deleted file mode 100644 index a66c2c3c2fc24048eb794a1a36e2b2fc8ff0da27..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
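The new `unique_name` module above is small but load-bearing: a module-level `UniqueNameGenerator` hands out `key_N` names, `switch` swaps the whole generator, and `guard` scopes a swap to a `with` block, which is what makes program construction reproducible in the new tests. Usage, following `test_unique_name.py`:

    import paddle.fluid as fluid

    n0 = fluid.unique_name.generate('fc')      # 'fc_0'
    n1 = fluid.unique_name.generate('fc')      # 'fc_1': the per-key counter increments

    with fluid.unique_name.guard():            # fresh generator inside the block
        a = fluid.unique_name.generate('fc')   # counters restart: 'fc_0'
    with fluid.unique_name.guard('B'):         # a string argument becomes the prefix
        b = fluid.unique_name.generate('fc')   # 'Bfc_0'
    assert a != b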
- -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import contextlib -import numpy -import unittest -import math -import sys - - -def train(use_cuda, save_dirname): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(x=cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - BATCH_SIZE = 20 - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - - exe.run(fluid.default_startup_program()) - - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print(avg_loss_value) - if avg_loss_value[0] < 10.0: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ['x'], - [y_predict], exe) - return - if math.isnan(float(avg_loss_value)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Fit a line cost is too large, {0:2.2}".format( - avg_loss_value[0])) - - -def infer(use_cuda, save_dirname=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). 
- [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - # The input's dimension should be 2-D and the second dim is 13 - # The input data should be >= 0 - batch_size = 10 - tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") - assert feed_target_names[0] == 'x' - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_x}, - fetch_list=fetch_targets) - print("infer shape: ", results[0].shape) - print("infer results: ", results[0]) - - -def main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - - # Directory for saving the trained model - save_dirname = "fit_a_line.inference.model" - - train(use_cuda, save_dirname) - infer(use_cuda, save_dirname) - - -class TestFitALine(unittest.TestCase): - def test_cpu(self): - with self.program_scope_guard(): - main(use_cuda=False) - - def test_cuda(self): - with self.program_scope_guard(): - main(use_cuda=True) - - @contextlib.contextmanager - def program_scope_guard(self): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py deleted file mode 100644 index 2462d425e16c40f2fa3b0944311aea4f44de3ac4..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
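The `infer` half of the deleted fit-a-line test above spells out the contract of `fluid.io.load_inference_model`; paired with the `save_inference_model` call in its training loop, it is the save/infer round trip every book test in this diff follows. Compactly, with the fit-a-line names:

    import numpy
    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())

    # After training: prune the program to ['x'] -> [y_predict] and persist it.
    # fluid.io.save_inference_model("fit_a_line.inference.model", ['x'], [y_predict], exe)

    # Later, possibly in another process: recover program, feed names, fetch targets.
    [program, feed_names, fetch_targets] = fluid.io.load_inference_model(
        "fit_a_line.inference.model", exe)
    tensor_x = numpy.random.uniform(0, 10, [10, 13]).astype("float32")
    results = exe.run(program,
                      feed={feed_names[0]: tensor_x},
                      fetch_list=fetch_targets)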
-from __future__ import print_function -import argparse -import paddle.v2.fluid as fluid -import paddle.v2 as paddle -import sys -import numpy -import unittest -import math -import sys - - -def parse_arg(): - parser = argparse.ArgumentParser() - parser.add_argument( - "nn_type", - help="The neural network type, in ['mlp', 'conv']", - type=str, - choices=['mlp', 'conv']) - parser.add_argument( - "--parallel", - help='Run in parallel or not', - default=False, - action="store_true") - parser.add_argument( - "--use_cuda", - help="Run the program by using CUDA", - default=False, - action="store_true") - return parser.parse_args() - - -BATCH_SIZE = 64 - - -def loss_net(hidden, label): - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(x=loss) - acc = fluid.layers.accuracy(input=prediction, label=label) - return prediction, avg_loss, acc - - -def mlp(img, label): - hidden = fluid.layers.fc(input=img, size=200, act='tanh') - hidden = fluid.layers.fc(input=hidden, size=200, act='tanh') - return loss_net(hidden, label) - - -def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - return loss_net(conv_pool_2, label) - - -def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if nn_type == 'mlp': - net_conf = mlp - else: - net_conf = conv_net - - if parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) - with pd.do(): - img_ = pd.read_input(img) - label_ = pd.read_input(label) - prediction, avg_loss, acc = net_conf(img_, label_) - for o in [avg_loss, acc]: - pd.write_output(o) - - avg_loss, acc = pd() - # get mean loss and acc through every devices. 
- avg_loss = fluid.layers.mean(x=avg_loss) - acc = fluid.layers.mean(x=acc) - else: - prediction, avg_loss, acc = net_conf(img, label) - - test_program = fluid.default_main_program().clone() - - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_loss) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder(feed_list=[img, label], place=place) - - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - # train a mini-batch, fetch nothing - exe.run(feed=feeder.feed(data)) - if (batch_id + 1) % 10 == 0: - acc_set = [] - avg_loss_set = [] - for test_data in test_reader(): - acc_np, avg_loss_np = exe.run(program=test_program, - feed=feeder.feed(test_data), - fetch_list=[acc, avg_loss]) - acc_set.append(float(acc_np)) - avg_loss_set.append(float(avg_loss_np)) - # get test acc and loss - acc_val = numpy.array(acc_set).mean() - avg_loss_val = numpy.array(avg_loss_set).mean() - if float(acc_val) > 0.2: # Smaller value to increase CI speed - if save_dirname is not None: - fluid.io.save_inference_model( - save_dirname, ["img"], [prediction], - exe, - save_file_name=save_param_filename) - return - else: - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. - format(pass_id, batch_id + 1, - float(avg_loss_val), float(acc_val))) - if math.isnan(float(avg_loss_val)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Loss of recognize digits is too large") - - -def infer(use_cuda, save_dirname=None, param_filename=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, fetch_targets - ] = fluid.io.load_inference_model(save_dirname, exe, param_filename) - - # The input's dimension of conv should be 4-D or 5-D. - # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. - batch_size = 1 - tensor_img = numpy.random.uniform(-1.0, 1.0, - [batch_size, 1, 28, 28]).astype("float32") - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. 
- results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - print("infer results: ", results[0]) - - -def main(use_cuda, parallel, nn_type, combine): - if not use_cuda and not parallel: - save_dirname = "recognize_digits_" + nn_type + ".inference.model" - save_filename = None - if combine == True: - save_filename = "__params_combined__" - else: - save_dirname = None - save_filename = None - - train( - nn_type=nn_type, - use_cuda=use_cuda, - parallel=parallel, - save_dirname=save_dirname, - save_param_filename=save_filename) - infer( - use_cuda=use_cuda, - save_dirname=save_dirname, - param_filename=save_filename) - - -class TestRecognizeDigits(unittest.TestCase): - pass - - -def inject_test_method(use_cuda, parallel, nn_type, combine): - def __impl__(self): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - main(use_cuda, parallel, nn_type, combine) - - fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda' - if use_cuda else 'cpu', 'parallel' - if parallel else 'normal', 'combine' - if combine else 'separate') - - setattr(TestRecognizeDigits, fn, __impl__) - - -def inject_all_tests(): - for use_cuda in (False, True): - for parallel in (False, True): - for nn_type in ('mlp', 'conv'): - inject_test_method(use_cuda, parallel, nn_type, True) - - # Two unit-test for saving parameters as separate files - inject_test_method(False, False, 'mlp', False) - inject_test_method(False, False, 'conv', False) - - -inject_all_tests() - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py deleted file mode 100644 index af917de8e337179feac835f4ba30afe584b5d42b..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
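The recognize-digits test above, like the sentiment and word2vec tests that follow, uses one `ParallelDo` recipe for multi-device execution: read per-device input slices inside `pd.do()`, write the per-device outputs, then average them after the block. The skeleton, with `img`, `label` and `net_conf` standing in for the test's own definitions:

    import paddle.fluid as fluid

    places = fluid.layers.get_places()           # one place per available device
    pd = fluid.layers.ParallelDo(places)
    with pd.do():
        img_ = pd.read_input(img)                # this device's slice of the batch
        label_ = pd.read_input(label)
        prediction, avg_loss, acc = net_conf(img_, label_)
        for o in [avg_loss, acc]:
            pd.write_output(o)

    avg_loss, acc = pd()                         # per-device results, concatenated
    avg_loss = fluid.layers.mean(avg_loss)       # reduce across devices
    acc = fluid.layers.mean(acc)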
-from __future__ import print_function - -import unittest -import paddle.v2.fluid as fluid -import paddle.v2 as paddle -import contextlib -import math -import numpy as np -import sys - - -def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, - hid_dim=32): - emb = fluid.layers.embedding( - input=data, size=[input_dim, emb_dim], is_sparse=True) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=3, - act="tanh", - pool_type="sqrt") - conv_4 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=4, - act="tanh", - pool_type="sqrt") - prediction = fluid.layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy, prediction - - -def stacked_lstm_net(data, - label, - input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - assert stacked_num % 2 == 1 - - emb = fluid.layers.embedding( - input=data, size=[input_dim, emb_dim], is_sparse=True) - # add bias attr - - # TODO(qijun) linear act - fc1 = fluid.layers.fc(input=emb, size=hid_dim) - lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) - - inputs = [fc1, lstm1] - - for i in range(2, stacked_num + 1): - fc = fluid.layers.fc(input=inputs, size=hid_dim) - lstm, cell = fluid.layers.dynamic_lstm( - input=fc, size=hid_dim, is_reverse=(i % 2) == 0) - inputs = [fc, lstm] - - fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') - lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - - prediction = fluid.layers.fc(input=[fc_last, lstm_last], - size=class_dim, - act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - accuracy = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, accuracy, prediction - - -def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - -def train(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): - BATCH_SIZE = 128 - PASS_NUM = 5 - dict_dim = len(word_dict) - class_dim = 2 - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - if not parallel: - cost, acc_out, prediction = net_method( - data, label, input_dim=dict_dim, class_dim=class_dim) - else: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) - with pd.do(): - cost, acc, _ = net_method( - pd.read_input(data), - pd.read_input(label), - input_dim=dict_dim, - class_dim=class_dim) - pd.write_output(cost) - pd.write_output(acc) - - cost, acc = pd() - cost = fluid.layers.mean(x=cost) - acc_out = fluid.layers.mean(x=acc) - prediction = None - assert save_dirname is None - - adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) - adagrad.minimize(cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=BATCH_SIZE) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - - exe.run(fluid.default_startup_program()) - - for pass_id in xrange(PASS_NUM): - for data in 
train_data(): - cost_val, acc_val = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - print("cost=" + str(cost_val) + " acc=" + str(acc_val)) - if cost_val < 0.4 and acc_val > 0.8: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ["words"], - prediction, exe) - return - if math.isnan(float(cost_val)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Cost is too large for {0}".format( - net_method.__name__)) - - -def infer(use_cuda, save_dirname=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - word_dict = paddle.dataset.imdb.word_dict() - tensor_words = create_random_lodtensor( - lod, place, low=0, high=len(word_dict) - 1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == "words" - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_words}, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) - - -def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - - train( - word_dict, - net_method, - use_cuda, - parallel=parallel, - save_dirname=save_dirname) - infer(use_cuda, save_dirname) - - -class TestUnderstandSentiment(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.word_dict = paddle.dataset.imdb.word_dict() - - @contextlib.contextmanager - def new_program_scope(self): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield - - def test_conv_cpu(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=convolution_net, - use_cuda=False, - save_dirname="understand_sentiment.inference.model") - - def test_conv_cpu_parallel(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=convolution_net, - use_cuda=False, - parallel=True) - - @unittest.skip(reason="make CI faster") - def test_stacked_lstm_cpu(self): - with self.new_program_scope(): - main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False) - - def test_stacked_lstm_cpu_parallel(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=stacked_lstm_net, - use_cuda=False, - parallel=True) - - def test_conv_gpu(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=convolution_net, - use_cuda=True, - save_dirname="understand_sentiment.inference.model") - - def test_conv_gpu_parallel(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=convolution_net, - use_cuda=True, - parallel=True) - - @unittest.skip(reason="make CI faster") - def test_stacked_lstm_gpu(self): - with self.new_program_scope(): - main(self.word_dict, 
net_method=stacked_lstm_net, use_cuda=True) - - def test_stacked_lstm_gpu_parallel(self): - with self.new_program_scope(): - main( - self.word_dict, - net_method=stacked_lstm_net, - use_cuda=True, - parallel=True) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py deleted file mode 100644 index 9bd8f90c5ee3ccbb6642dc09a8177be914856975..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# # Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import unittest -import os -import numpy as np -import math -import sys - - -def create_random_lodtensor(lod, place, low, high): - data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") - res = fluid.LoDTensor() - res.set(data, place) - res.set_lod([lod]) - return res - - -def infer(use_cuda, save_dirname=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - 1 - - # Setup input, by creating 4 words, and setting up lod required for - # lookup_table_op - lod = [0, 1] - first_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - second_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - third_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - - assert feed_target_names[0] == 'firstw' - assert feed_target_names[1] == 'secondw' - assert feed_target_names[2] == 'thirdw' - assert feed_target_names[3] == 'forthw' - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. 
- results = exe.run(inference_program, - feed={ - feed_target_names[0]: first_word, - feed_target_names[1]: second_word, - feed_target_names[2]: third_word, - feed_target_names[3]: fourth_word - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) - - -def train(use_cuda, is_sparse, parallel, save_dirname): - PASS_NUM = 100 - EMBED_SIZE = 32 - HIDDEN_SIZE = 256 - N = 5 - BATCH_SIZE = 32 - IS_SPARSE = is_sparse - - def __network__(words): - embed_first = fluid.layers.embedding( - input=words[0], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_second = fluid.layers.embedding( - input=words[1], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_third = fluid.layers.embedding( - input=words[2], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - embed_forth = fluid.layers.embedding( - input=words[3], - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - - concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], axis=1) - hidden1 = fluid.layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') - predict_word = fluid.layers.fc(input=hidden1, - size=dict_size, - act='softmax') - cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(x=cost) - return avg_cost, predict_word - - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - - first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') - second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') - third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') - forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') - next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - - if not parallel: - avg_cost, predict_word = __network__( - [first_word, second_word, third_word, forth_word, next_word]) - else: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, next_word - ])) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(x=pd()) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - feeder = fluid.DataFeeder( - feed_list=[first_word, second_word, third_word, forth_word, next_word], - place=place) - - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_cost_np = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - if avg_cost_np[0] < 5.0: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - 'firstw', 'secondw', 'thirdw', 'forthw' - ], [predict_word], exe) - return - if math.isnan(float(avg_cost_np[0])): - sys.exit("got NaN loss, training failed.") - - raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) - - -def main(use_cuda, is_sparse, parallel): - if use_cuda and not 
fluid.core.is_compiled_with_cuda(): - return - save_dirname = "word2vec.inference.model" - train(use_cuda, is_sparse, parallel, save_dirname) - infer(use_cuda, save_dirname) - - -FULL_TEST = os.getenv('FULL_TEST', - '0').lower() in ['true', '1', 't', 'y', 'yes', 'on'] -SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster" - - -class W2VTest(unittest.TestCase): - pass - - -def inject_test_method(use_cuda, is_sparse, parallel): - fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse" - if is_sparse else "dense", "parallel" - if parallel else "normal") - - def __impl__(*args, **kwargs): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel) - - # run only 2 cases: use_cuda is either True or False - if is_sparse == False and parallel == False: - fn = __impl__ - else: - # skip the other test when on CI server - fn = unittest.skipUnless( - condition=FULL_TEST, reason=SKIP_REASON)(__impl__) - - setattr(W2VTest, fn_name, fn) - - -for use_cuda in (False, True): - for is_sparse in (False, True): - for parallel in (False, True): - inject_test_method(use_cuda, is_sparse, parallel) - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt b/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt deleted file mode 100644 index 4d7664469e481344cf9eea84688f068b4fb99dee..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py deleted file mode 100644 index c443c4e0b7d68bdff4a767372d0b9c96f34eba5e..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
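`inject_test_method` above (and its recognize-digits counterpart earlier) builds the unittest matrix programmatically: close over one parameter combination, name the method after it, optionally wrap it in `skipUnless` so CI stays fast, and attach it with `setattr`. The pattern in miniature, with a hypothetical `check` in place of the real body:

    import unittest

    FULL_TEST = False  # mirrors the FULL_TEST environment switch above

    class W2VTest(unittest.TestCase):
        pass

    def check(use_cuda, is_sparse, parallel):
        pass  # the real body builds fresh programs/scopes and calls main(...)

    def inject_test_method(use_cuda, is_sparse, parallel):
        name = 'test_{0}_{1}_{2}'.format('cuda' if use_cuda else 'cpu',
                                         'sparse' if is_sparse else 'dense',
                                         'parallel' if parallel else 'normal')

        def impl(self):
            check(use_cuda, is_sparse, parallel)

        if is_sparse or parallel:  # only the minimal matrix runs unless FULL_TEST
            impl = unittest.skipUnless(FULL_TEST, 'fast CI mode')(impl)
        setattr(W2VTest, name, impl)

    for use_cuda in (False, True):
        for is_sparse in (False, True):
            for parallel in (False, True):
                inject_test_method(use_cuda, is_sparse, parallel)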
- -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import os - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') - -y_predict = fluid.layers.fc(input=x, size=1, act=None) - -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) - -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - -BATCH_SIZE = 20 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() -# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -else: - trainer_prog = t.get_trainer_program() - - exe.run(fluid.default_startup_program()) - - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") - for data in train_reader(): - avg_loss_value = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print("loss:" + str(avg_loss_value)) - if avg_loss_value[0] < 10.0: - exit(0) -exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py deleted file mode 100644 index 298ecfc386b3ae093cf714a41f5072759cb2cf2e..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
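The deleted distributed fit-a-line script above is the canonical `DistributeTranspiler` launch sequence: transpile the program once, then let environment variables decide whether the current process serves parameters or trains. Reduced to its control flow (env names as in the script; `optimize_ops`, `params_grads` and `exe` as built there by `minimize` and `Executor`):

    import os
    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(optimize_ops, params_grads,
                pservers=os.getenv("PSERVERS"), trainers=2)

    if os.getenv("TRAINING_ROLE", "TRAINER") == "PSERVER":
        endpoint = os.getenv("SERVER_ENDPOINT")
        pserver_prog = t.get_pserver_program(endpoint)
        exe.run(t.get_startup_program(endpoint, pserver_prog))
        exe.run(pserver_prog)        # blocks here, serving parameter updates
    else:
        exe.run(fluid.default_startup_program())
        trainer_prog = t.get_trainer_program()
        # ... run minibatches against trainer_prog ...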
- -from __future__ import print_function - -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import os -import sys - -TRAINERS = 5 -BATCH_SIZE = 128 -PASS_NUM = 100 - - -def resnet_cifar10(input, depth=32): - def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - tmp = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=tmp, act=act) - - def shortcut(input, ch_in, ch_out, stride): - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - def basicblock(input, ch_in, ch_out, stride): - tmp = conv_bn_layer(input, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) - short = shortcut(input, ch_in, ch_out, stride) - return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') - - def layer_warp(block_func, input, ch_in, ch_out, count, stride): - tmp = block_func(input, ch_in, ch_out, stride) - for i in range(1, count): - tmp = block_func(tmp, ch_out, ch_out, 1) - return tmp - - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) - res2 = layer_warp(basicblock, res1, 16, 32, n, 2) - res3 = layer_warp(basicblock, res2, 32, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - return pool - - -def vgg16_bn_drop(input): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - return fc2 - - -classdim = 10 -data_shape = [3, 32, 32] - -images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') - -net_type = "vgg" -if len(sys.argv) >= 2: - net_type = sys.argv[1] - -if net_type == "vgg": - print("training vgg net") - net = vgg16_bn_drop(images) -elif net_type == "resnet": - print("training resnet") - net = resnet_cifar10(images, 32) -else: - raise ValueError("%s network is not supported" % net_type) - -predict = fluid.layers.fc(input=net, size=classdim, act='softmax') -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) - -optimizer = fluid.optimizer.Adam(learning_rate=0.001) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) -exe = fluid.Executor(place) - -t = 
fluid.DistributeTranspiler()
-# all parameter server endpoints list for splitting parameters
-pserver_endpoints = os.getenv("PSERVERS")
-# server endpoint for current node
-current_endpoint = os.getenv("SERVER_ENDPOINT")
-# run as trainer or parameter server
-training_role = os.getenv("TRAINING_ROLE",
-                          "TRAINER")  # get the training role: trainer/pserver
-t.transpile(
-    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)
-
-if training_role == "PSERVER":
-    if not current_endpoint:
-        print("need env SERVER_ENDPOINT")
-        exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint)
-    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
-    exe.run(pserver_startup)
-    exe.run(pserver_prog)
-elif training_role == "TRAINER":
-    trainer_prog = t.get_trainer_program()
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in range(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_reader():
-            loss, acc = exe.run(trainer_prog,
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            pass_acc = accuracy.eval(exe)
-            print("pass_id:" + str(pass_id) + " loss:" + str(loss) +
-                  " pass_acc:" + str(pass_acc))
-    # this model is slow, so if we can train two mini batches,
-    # we think it works properly.
-    print("trainer run end")
-else:
-    print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
-exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
deleted file mode 100644
index 1210bf1d8441356e41e18266e8e087ba0d7b032c..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
-import paddle.v2.fluid as fluid
-import time
-import os
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_len = len(verb_dict)
-
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-
-IS_SPARSE = True
-PASS_NUM = 10
-BATCH_SIZE = 20
-
-embedding_name = 'emb'
-
-
-def load_parameter(file_name, h, w):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header.
- return np.fromfile(f, dtype=np.float32).reshape(h, w) - - -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - **ignored): - # 8 features - predicate_embedding = fluid.layers.embedding( - input=predicate, - size=[pred_len, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='vemb') - - mark_embedding = fluid.layers.embedding( - input=mark, - size=[mark_dict_len, mark_dim], - dtype='float32', - is_sparse=IS_SPARSE) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - fluid.layers.embedding( - size=[word_dict_len, word_dim], - input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers - ] - - hidden_0 = fluid.layers.sums(input=hidden_0_layers) - - lstm_0 = fluid.layers.dynamic_lstm( - input=hidden_0, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid') - - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) - ]) - - lstm = fluid.layers.dynamic_lstm( - input=mix_hidden, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=((i % 2) == 1)) - - input_tmp = [mix_hidden, lstm] - - feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) - ]) - - return feature_out - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - # define network topology - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) - avg_cost = fluid.layers.mean(x=crf_cost) - - # TODO(qiao) - # check other optimizers and check why out will be NAN - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - - # TODO(qiao) - # add dependency track and move this config before optimizer - crf_decode 
= fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) - - chunk_evaluator = fluid.evaluator.ChunkEvaluator( - input=crf_decode, - label=target, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target - ], - place=place) - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - start_time = time.time() - batch_id = 0 - exe.run(fluid.default_startup_program()) - embedding_param = fluid.global_scope().find_var( - embedding_name).get_tensor() - embedding_param.set( - load_parameter(conll05.get_embedding(), word_dict_len, word_dim), - place) - for pass_id in xrange(PASS_NUM): - chunk_evaluator.reset(exe) - for data in train_data(): - cost, precision, recall, f1_score = exe.run( - trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + chunk_evaluator.metrics) - pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( - exe) - - if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) - if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) - - batch_id = batch_id + 1 - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py deleted file mode 100644 index 0d5ad9885028b8b53d0ed4017cefc3eae04dc691..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
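The to_lodtensor helper that reappears throughout these scripts turns a list of variable-length sequences into one flat tensor plus a table of cumulative offsets (the LoD). The offset arithmetic in isolation, with plain numpy:

import numpy as np


def build_lod(sequences):
    # lod[i] is where sequence i starts in the flattened data;
    # the last entry is the total element count.
    lod = [0]
    for seq in sequences:
        lod.append(lod[-1] + len(seq))
    flattened = np.concatenate(sequences, axis=0).astype("int64")
    return flattened.reshape([len(flattened), 1]), lod


flattened, lod = build_lod([[1, 2, 3], [4, 5]])
assert lod == [0, 3, 5]
assert flattened.shape == (5, 1)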
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-import os
-
-PASS_NUM = 100
-EMBED_SIZE = 32
-HIDDEN_SIZE = 256
-N = 5
-BATCH_SIZE = 32
-IS_SPARSE = True
-TRAINERS = 2
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-
-first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-
-embed_first = fluid.layers.embedding(
-    input=first_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_second = fluid.layers.embedding(
-    input=second_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_third = fluid.layers.embedding(
-    input=third_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_forth = fluid.layers.embedding(
-    input=forth_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-
-concat_embed = fluid.layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
-hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
-predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
-train_reader = paddle.batch(
-    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-
-t = fluid.DistributeTranspiler()
-# all parameter server endpoints list for splitting parameters
-pserver_endpoints = os.getenv("PSERVERS")
-# server endpoint for current node
-current_endpoint = os.getenv("SERVER_ENDPOINT")
-# run as trainer or parameter server
-training_role = os.getenv("TRAINING_ROLE",
-                          "TRAINER")  # get the training role: trainer/pserver
-t.transpile(
-    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)
-if training_role == "PSERVER":
-    if not current_endpoint:
-        print("need env SERVER_ENDPOINT")
-        exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint)
-    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
-    exe.run(pserver_startup)
-    exe.run(pserver_prog)
-elif training_role == "TRAINER":
-    feeder = fluid.DataFeeder(
-        feed_list=[first_word, second_word, third_word, forth_word, next_word],
-        place=place)
-    exe.run(fluid.default_startup_program())
-    for pass_id in range(PASS_NUM):
-        for data in train_reader():
-            avg_cost_np = exe.run(t.get_trainer_program(),
-                                  feed=feeder.feed(data),
-                                  fetch_list=[avg_cost])
-            print("avg_cost_np", avg_cost_np)
-            if avg_cost_np[0] < 5.0:
-                exit(0)  # if avg cost less than 5.0, we think our code is good.
-else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") -exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py deleted file mode 100644 index 15d2d40979edc7ceb2cdafcb21b10e21c4ae7392..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -import os - -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -hidden_dim = 32 -word_dim = 16 -IS_SPARSE = True -batch_size = 10 -max_length = 50 -topk_size = 50 -trg_dic_size = 10000 - -decoder_size = hidden_dim - - -def encoder_decoder(): - # encoder - src_word_id = layers.data( - name="src_word_id", shape=[1], dtype='int64', lod_level=1) - src_embedding = layers.embedding( - input=src_word_id, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_last_step(input=lstm_hidden0) - - # decoder - trg_language_word = layers.data( - name="target_language_word", shape=[1], dtype='int64', lod_level=1) - trg_embedding = layers.embedding( - input=trg_language_word, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - rnn = fluid.layers.DynamicRNN() - with rnn.block(): - current_word = rnn.step_input(trg_embedding) - mem = rnn.memory(init=encoder_out) - fc1 = fluid.layers.fc(input=[current_word, mem], - size=decoder_size, - act='tanh') - out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') - rnn.update_memory(mem, fc1) - rnn.output(out) - - return rnn() - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - rnn_out = encoder_decoder() - label = layers.data( - name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) - cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = fluid.layers.mean(x=cost) - - optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) - optimize_ops, params_grads = 
optimizer.minimize(avg_cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=batch_size) - - place = core.CPUPlace() - exe = Executor(place) - - t = fluid.DistributeTranspiler() - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - exe.run(framework.default_startup_program()) - - batch_id = 0 - for pass_id in xrange(2): - for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - trg_word = to_lodtensor(map(lambda x: x[1], data), place) - trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) - outs = exe.run(trainer_prog, - feed={ - 'src_word_id': word_data, - 'target_language_word': trg_word, - 'target_language_next_word': trg_word_next - }, - fetch_list=[avg_cost]) - avg_cost_val = np.array(outs[0]) - print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + - " avg_cost=" + str(avg_cost_val)) - if batch_id > 3: - exit(0) - batch_id += 1 - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py deleted file mode 100644 index 1c1fffc5892aa6ade05a341efb7043cea538b03f..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
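The machine-translation script above drives its decoder with fluid.layers.DynamicRNN. Condensed from that script (it needs the old paddle.v2.fluid package to run), the step/memory/output protocol looks like this:

import paddle.v2.fluid as fluid


def decoder(trg_embedding, encoder_out, decoder_size, target_dict_dim):
    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        # one time step of the target sequence
        current_word = rnn.step_input(trg_embedding)
        # recurrent state, seeded with the encoder's last hidden state
        mem = rnn.memory(init=encoder_out)
        fc1 = fluid.layers.fc(input=[current_word, mem],
                              size=decoder_size,
                              act='tanh')
        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
        rnn.update_memory(mem, fc1)
        rnn.output(out)
    return rnn()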
- -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import os - -images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') -conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") -conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - -predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) -optimizer = fluid.optimizer.Adam(learning_rate=0.01) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -BATCH_SIZE = 50 -PASS_NUM = 3 -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -pserver_endpoints = os.getenv("PSERVERS") # all pserver endpoints -trainers = int(os.getenv("TRAINERS")) # total trainer count -current_endpoint = os.getenv("SERVER_ENDPOINT") # current pserver endpoint -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - -t = fluid.DistributeTranspiler() -t.transpile( - optimize_ops, - params_grads, - 0, - pservers=pserver_endpoints, - trainers=trainers) - -if training_role == "PSERVER": - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - accuracy.reset(exe) - batch_id = 0 - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - if batch_id % 100 == 0: - print("batch_id %d, loss: %f, acc: %f" % - (batch_id, loss, pass_acc)) - batch_id += 1 - - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) -else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py deleted file mode 100644 index c442ada6e3c03426f084da379207bc48817072ed..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import os - -BATCH_SIZE = 128 -PASS_NUM = 100 - -images = fluid.layers.data(name='x', shape=[784], dtype='float32') - -# TODO(aroraabhinav) Add regularization and error clipping after -# Issue 7432(https://github.com/PaddlePaddle/Paddle/issues/7432) is resolved. -hidden1 = fluid.layers.fc(input=images, size=128, act='relu') -hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') -predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - -label = fluid.layers.data(name='y', shape=[1], dtype='int64') - -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) - -optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() -# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - accuracy.reset(exe) - batch_id = 0 - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - if batch_id % 100 == 0: - print("batch_id %d, loss: %f, acc: %f" % - (batch_id, loss, pass_acc)) - batch_id += 1 - - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) -else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py deleted file mode 100644 index 363c7102c7fa431bfb5457003a28bee6fce3a77b..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import os -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.optimizer import SGDOptimizer - -IS_SPARSE = True -BATCH_SIZE = 256 -PASS_NUM = 100 - - -def get_usr_combined_features(): - USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - uid = layers.data(name='user_id', shape=[1], dtype='int64') - usr_emb = layers.embedding( - input=uid, - dtype='float32', - size=[USR_DICT_SIZE, 32], - param_attr='user_table', - is_sparse=IS_SPARSE) - usr_fc = layers.fc(input=usr_emb, size=32) - USR_GENDER_DICT_SIZE = 2 - - usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') - usr_gender_emb = layers.embedding( - input=usr_gender_id, - size=[USR_GENDER_DICT_SIZE, 16], - param_attr='gender_table', - is_sparse=IS_SPARSE) - usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) - - USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") - usr_age_emb = layers.embedding( - input=usr_age_id, - size=[USR_AGE_DICT_SIZE, 16], - is_sparse=IS_SPARSE, - param_attr='age_table') - usr_age_fc = layers.fc(input=usr_age_emb, size=16) - - USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") - usr_job_emb = layers.embedding( - input=usr_job_id, - size=[USR_JOB_DICT_SIZE, 16], - param_attr='job_table', - is_sparse=IS_SPARSE) - usr_job_fc = layers.fc(input=usr_job_emb, size=16) - - concat_embed = layers.concat( - input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - - usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - return usr_combined_features - - -def get_mov_combined_features(): - MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') - mov_emb = layers.embedding( - input=mov_id, - dtype='float32', - size=[MOV_DICT_SIZE, 32], - param_attr='movie_table', - is_sparse=IS_SPARSE) - mov_fc = layers.fc(input=mov_emb, size=32) - - CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data(name='category_id', shape=[1], dtype='int64') - mov_categories_emb = layers.embedding( - input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) - mov_categories_hidden = layers.sequence_pool( - input=mov_categories_emb, pool_type="sum") - - MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') - mov_title_emb = layers.embedding( - input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) - mov_title_conv = nets.sequence_conv_pool( - input=mov_title_emb, - num_filters=32, - filter_size=3, - act="tanh", - pool_type="sum") - - concat_embed = layers.concat( - input=[mov_fc, 
mov_categories_hidden, mov_title_conv], axis=1) - - mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - return mov_combined_features - - -def model(): - usr_combined_features = get_usr_combined_features() - mov_combined_features = get_mov_combined_features() - - # need cos sim - inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) - scale_infer = layers.scale(x=inference, scale=5.0) - - label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(x=square_cost) - - return avg_cost - - -def func_feed(feeding, data, place): - feed_tensors = {} - for (key, idx) in feeding.iteritems(): - tensor = core.LoDTensor() - if key != "category_id" and key != "movie_title": - if key == "score": - numpy_data = np.array(map(lambda x: x[idx], data)).astype( - "float32") - else: - numpy_data = np.array(map(lambda x: x[idx], data)).astype( - "int64") - else: - numpy_data = map(lambda x: np.array(x[idx]).astype("int64"), data) - lod_info = [len(item) for item in numpy_data] - offset = 0 - lod = [offset] - for item in lod_info: - offset += item - lod.append(offset) - numpy_data = np.concatenate(numpy_data, axis=0) - tensor.set_lod([lod]) - - numpy_data = numpy_data.reshape([numpy_data.shape[0], 1]) - tensor.set(numpy_data, place) - feed_tensors[key] = tensor - return feed_tensors - - -def main(): - cost = model() - optimizer = SGDOptimizer(learning_rate=0.2) - optimize_ops, params_grads = optimizer.minimize(cost) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.movielens.train(), buf_size=8192), - batch_size=BATCH_SIZE) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv("TRAINING_ROLE", "TRAINER") - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - exe.run(fluid.default_startup_program()) - trainer_prog = t.get_trainer_program() - - feeding = { - 'user_id': 0, - 'gender_id': 1, - 'age_id': 2, - 'job_id': 3, - 'movie_id': 4, - 'category_id': 5, - 'movie_title': 6, - 'score': 7 - } - - for pass_id in range(PASS_NUM): - for data in train_reader(): - outs = exe.run(trainer_prog, - feed=func_feed(feeding, data, place), - fetch_list=[cost]) - out = np.array(outs[0]) - print("cost=" + str(out[0])) - if out[0] < 6.0: - print("Training complete. Average cost is less than 6.0.") - # if avg cost less than 6.0, we think our code is good. 
- exit(0) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py deleted file mode 100644 index c5c0856c31a43d04fa8983b612e99487cb41d615..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import os -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid - - -def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, - hid_dim=32): - emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=3, - act="tanh", - pool_type="sqrt") - conv_4 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=4, - act="tanh", - pool_type="sqrt") - prediction = fluid.layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) - optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) - accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) - return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 - - word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net( - data, label, input_dim=dict_dim, class_dim=class_dim) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training 
role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - exe.run(fluid.default_startup_program()) - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - - for pass_id in xrange(PASS_NUM): - accuracy.reset(exe) - for data in train_data(): - cost_val, acc_val = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - pass_acc = accuracy.eval(exe) - print("cost=" + str(cost_val) + " acc=" + str(acc_val) + - " pass_acc=" + str(pass_acc)) - if cost_val < 1.0 and pass_acc > 0.8: - exit(0) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py deleted file mode 100644 index 99e2c2bbac6d12912de0a718ff6e4151a28e12eb..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
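The stacked_lstm_net defined just below alternates the direction of each dynamic_lstm level (is_reverse flips on even depths) while feeding each level the previous fc/lstm pair. The stacking loop, condensed from the script that follows:

import paddle.v2.fluid as fluid


def stacked_lstm(emb, hid_dim=512, stacked_num=3):
    assert stacked_num % 2 == 1  # an odd depth keeps the top level forward
    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
    inputs = [fc1, lstm1]
    for i in range(2, stacked_num + 1):
        fc = fluid.layers.fc(input=inputs, size=hid_dim)
        lstm, cell = fluid.layers.dynamic_lstm(
            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
        inputs = [fc, lstm]
    return inputs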
- -import numpy as np -import os -import paddle.v2 as paddle -import paddle.v2.fluid as fluid - - -def stacked_lstm_net(data, - label, - input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - assert stacked_num % 2 == 1 - - emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) - # add bias attr - - # TODO(qijun) linear act - fc1 = fluid.layers.fc(input=emb, size=hid_dim) - lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) - - inputs = [fc1, lstm1] - - for i in range(2, stacked_num + 1): - fc = fluid.layers.fc(input=inputs, size=hid_dim) - lstm, cell = fluid.layers.dynamic_lstm( - input=fc, size=hid_dim, is_reverse=(i % 2) == 0) - inputs = [fc, lstm] - - fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') - lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - - prediction = fluid.layers.fc(input=[fc_last, lstm_last], - size=class_dim, - act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) - optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) - accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) - return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 - - word_dict = paddle.dataset.imdb.word_dict() - print "loaded word dict successfully" - dict_dim = len(word_dict) - class_dim = 2 - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net( - data, label, input_dim=dict_dim, class_dim=class_dim) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - - t = fluid.DistributeTranspiler() - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - exe.run(fluid.default_startup_program()) - trainer_prog = t.get_trainer_program() - for pass_id in xrange(PASS_NUM): - accuracy.reset(exe) - for data in train_data(): - cost_val, acc_val = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - pass_acc = accuracy.eval(exe) - print("cost=" + 
str(cost_val) + " acc=" + str(acc_val) + - " pass_acc=" + str(pass_acc)) - if cost_val < 1.0 and acc_val > 0.8: - exit(0) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/setup.py.in b/python/setup.py.in index 5a0d99995439135ce814e58fcfbcdc337965b164..6fff1bb09e4923650356a50afba0e105b3d8c0c2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -71,9 +71,9 @@ packages=['paddle', 'paddle.v2.reader', 'paddle.v2.master', 'paddle.v2.plot', - 'paddle.v2.fluid', - 'paddle.v2.fluid.proto', - 'paddle.v2.fluid.layers', + 'paddle.fluid', + 'paddle.fluid.proto', + 'paddle.fluid.layers', 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: @@ -102,14 +102,14 @@ setup(name='${PACKAGE_NAME}', ext_modules=[Extension('_foo', ['stub.cc'])], package_data={ 'paddle.v2.master': ['libpaddle_master.so'], - 'paddle.v2.fluid': ['core.so'], + 'paddle.fluid': ['core.so'], 'py_paddle':['*.py','_swig_paddle.so'] }, package_dir={ '': '${CMAKE_CURRENT_SOURCE_DIR}', - # The paddle.v2.fluid.proto will be generated while compiling. + # The paddle.fluid.proto will be generated while compiling. # So that package points to other directory. - 'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', + 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', 'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle' }, scripts=paddle_bins,
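For context on the setup.py.in hunk above: setuptools lets an individual package be mapped to an out-of-tree directory via package_dir, which is how the generated paddle.fluid.proto package is picked up from the build tree. A generic sketch of the mechanism (package names and paths here are illustrative):

from setuptools import setup

setup(
    name='example',
    packages=['pkg', 'pkg.generated'],
    package_dir={
        '': '.',                       # default: packages live in the source tree
        'pkg.generated': 'build/gen',  # except this one, emitted by the build
    },
)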