diff --git a/CMakeLists.txt b/CMakeLists.txt index 5db5c228be2d6491463ec1ddb17de7bec730bd44..a2f440c2d089b5d596ab59d5099c0066ef325614 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) -project(paddle CXX C Go) +project(paddle CXX C) message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " @@ -60,7 +60,7 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" ON) +option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -146,6 +146,7 @@ include(external/cares) include(external/grpc) include(cudnn) # set cudnn libraries, must before configure +include(cupti) include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages @@ -174,7 +175,7 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - include(cuda) + include(cuda) endif(WITH_GPU) if(WITH_MKLML) @@ -201,17 +202,18 @@ endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # placed after this block, because they depends on it. if(WITH_GOLANG) + enable_language(Go) add_subdirectory(go) endif(WITH_GOLANG) set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") -SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") add_subdirectory(paddle) if(WITH_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() if(WITH_DOC) diff --git a/Dockerfile.android b/Dockerfile.android index 9d13a414f67be04e17b7d83403228d92bce0eda9..cc022d596b4b74dd1e4f4d0901dd81c91a7decd1 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -21,16 +21,6 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-
 # git credential to skip password typing
 RUN git config --global credential.helper store
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index 98356cd7613baff7f0cd66d1462068232b2b8500..13ad8e1b6237e6f41a076c4fb54311728832ae33 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,18 +1,35 @@
-#FROM python:2.7.14
 FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-RUN apt-get update && apt-get install -y python
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
+
+# you can get the mirror list here:
+#   https://launchpad.net/ubuntu/+archivemirrors
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+
 RUN pip install paddlepaddle
+# if the network is slow, you may need to add a proxy here.
+# ENV https_proxy=
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 RUN pip uninstall -y paddlepaddle
+# unset the proxy if it was set.
+# ENV https_proxy=""
+
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+
+# tf k8s
+RUN pip install tensorflow==1.4.0
+ADD tf_k8s /usr/bin
+RUN chmod +x /usr/bin/tf_k8s
+ADD vgg16_tf.py /workspace/
 # below lines may change a lot for debugging
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && \
-chmod +x /usr/bin/paddle_k8s
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN chmod +x /usr/bin/paddle_k8s
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
index 0a0ed25ebe43c4cc0d5ab0b72cf36c936fcce802..3d56caac009464d1073423bb63abff1f8b0cf28f 100644
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -11,7 +11,7 @@ spec:
         paddle-job: vgg16job
     spec:
       imagePullSecrets:
-      - name: job-registry-secret
+      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
new file mode 100644
index 0000000000000000000000000000000000000000..4fc263d5f681aeabfa71f1758714d269d987b272
--- /dev/null
+++ b/benchmark/cluster/vgg16/tf_k8s
@@ -0,0 +1,82 @@
+#!/bin/bash
+check_trainer_ret() {
+  ret=$1
+  stdbuf -oL echo "job returned $ret...setting pod return message..."
+  stdbuf -oL echo "==============================="
+
+  if [ $ret -eq 136 ] ; then
+    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
+  elif [ $ret -eq 139 ] ; then
+    echo "Segmentation Fault" > /dev/termination-log
+  elif [ $ret -eq 1 ] ; then
+    echo "General Error" > /dev/termination-log
+  elif [ $ret -eq 134 ] ; then
+    echo "Program Abort" > /dev/termination-log
+  fi
+  stdbuf -oL echo "termination log written..."
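+  # exit with the trainer's status so Kubernetes marks the pod failed and
+  # surfaces the /dev/termination-log message written above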
+ exit $ret +} + +g_pservers="" +g_trainers="" + +wait_running_pods(){ + pserver_label="tf-job-pserver=${JOB_NAME}" + trainer_label="tf-job-trainer=${JOB_NAME}" + + stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM} + stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM} + + g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT}) + g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT}) +} + +start_tf_pserver(){ + wait_running_pods + + label="tf-job-pserver=${JOB_NAME}" + pserver_id=$(python /root/k8s_tools.py fetch_id ${label}) + + cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ + --job_name=${TF_JOB_NAME} --task_index=${pserver_id}" + + stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" +} + +start_tf_trainer(){ + wait_running_pods + + label="tf-job-trainer=${JOB_NAME}" + trainer_id=$(python /root/k8s_tools.py fetch_id ${label}) + + cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ + --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}" + + stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" + check_trainer_ret $? +} + +start_tf(){ + if [[ "${TF_JOB_NAME}" == "worker" ]]; then + start_tf_trainer + else + start_tf_pserver + fi +} + +usage() { + echo "usage: tf_k8s []:" + echo " start_tf Start tensorflow jobs" +} + +case "$1" in + start_tf) + start_tf + ;; + --help) + usage + ;; + *) + usage + ;; +esac diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e37c700819119c8af05c40fe4b8d13911efc3e1 --- /dev/null +++ b/benchmark/cluster/vgg16/tf_pserver.yaml @@ -0,0 +1,56 @@ +apiVersion: extensions/v1beta1 +kind: ReplicaSet +metadata: + name: vgg16job-tf-pserver +spec: + replicas: 10 + template: + metadata: + labels: + tf-job-pserver: vgg16job-tf + spec: + hostNetwork: true + imagePullSecrets: + - name: job-registry-secret + containers: + - name: pserver + image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" + imagePullPolicy: Always + command: ["tf_k8s", "start_tf"] + ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PORT + value: "32036" + - name: ENTRY + value: "python vgg16_tf.py" + - name: JOB_NAME + value: vgg16job-tf + - name: PSERVERS_NUM + value: "10" + - name: TF_JOB_NAME + value: "ps" + - name: TRAINERS_NUM + value: "20" + - name: BATCH_SIZE + value: "128" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: NUM_PASSES + value: "1" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 10Gi + cpu: 4 + limits: + memory: 10Gi + cpu: 4 diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08795df3addfa7b618db24a65e57be190e268f06 --- /dev/null +++ b/benchmark/cluster/vgg16/tf_trainer.yaml @@ -0,0 +1,58 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: vgg16job-tf-trainer +spec: + parallelism: 20 + completions: 20 + template: + metadata: + labels: + tf-job-trainer: vgg16job-tf + spec: + imagePullSecrets: + - name: job-registry-secret + hostNetwork: true + containers: + - name: trainer + image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" + imagePullPolicy: Always + command: ["tf_k8s", "start_tf"] + 
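# tf_k8s starts either a TF worker or a pserver, depending on the
+        # TF_JOB_NAME value in the env section below
+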
ports: + - name: jobport-30236 + containerPort: 30236 + env: + - name: PORT + value: "32036" + - name: JOB_NAME + value: vgg16job-tf + - name: TF_JOB_NAME + value: "worker" + - name: ENTRY + value: "python vgg16_tf.py" + - name: PSERVERS_NUM + value: "10" + - name: BATCH_SIZE + value: "128" + - name: TRAINERS_NUM + value: "20" + - name: TRAINER_PACKAGE + value: "/workspace" + - name: NUM_PASSES + value: "1" + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: "metadata.namespace" + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: "status.podIP" + resources: + requests: + memory: 40Gi + cpu: 2 + limits: + memory: 40Gi + cpu: 2 + restartPolicy: Never diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 99395699f2ff5a04f340a1ca73d6e9a853981f5c..7323241f4d3bdcbe9c9efcbaaedebe01adbd4701 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -68,6 +68,21 @@ parser.add_argument( type=str2bool, default=True, help='Whether to run as local mode.') + +parser.add_argument( + "--ps_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--trainer_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") + +# Flags for defining the tf.train.Server +parser.add_argument( + "--task_index", type=int, default=0, help="Index of task within the job") args = parser.parse_args() @@ -180,8 +195,9 @@ def main(): iters += 1 num_samples += len(data) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" - % (pass_id, iters, loss, acc, time.time() - ts) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" + % (pass_id, iters, loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time @@ -209,27 +225,24 @@ def main(): batch_size=args.batch_size) train_loop(exe, fluid.default_main_program()) else: - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # all pserver endpoints - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, "6174"])) - pserver_endpoints = ",".join(eplist) - print("pserver endpoints: ", pserver_endpoints) trainers = int(os.getenv("TRAINERS")) # total trainer count print("trainers total: ", trainers) - current_endpoint = os.getenv( - "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t = fluid.DistributeTranspiler() t.transpile( optimize_ops, params_grads, - pservers=pserver_endpoints, + trainer_id=args.task_index, + pservers=args.ps_hosts, trainers=trainers) if training_role == "PSERVER": + current_endpoint = os.getenv("POD_IP") + ":" + os.getenv( + "PADDLE_INIT_PORT") if not current_endpoint: print("need env SERVER_ENDPOINT") exit(1) diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..996df0e314b867ea8de618dfd3977f490fbe8372 --- /dev/null +++ b/benchmark/cluster/vgg16/vgg16_tf.py @@ -0,0 +1,362 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in TensorFlow +You can get distribution example template structure here: +https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb +https://www.tensorflow.org/deploy/distributed +""" + +import tensorflow as tf +import paddle.v2 as paddle +import numpy as np +import argparse +import time + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, NCHW=[batch, channels, height, width].' + 'Only support NHWC right now.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + +parser.add_argument( + "--ps_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--worker_hosts", + type=str, + default="", + help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--job_name", type=str, default="", help="One of 'worker', 'ps'") +# Flags for defining the tf.train.Server +parser.add_argument( + "--task_index", type=int, default=0, help="Index of task within the job") + +args = parser.parse_args() + + +class VGG16Model(object): + def __init__(self): + self.parameters = [] + + def batch_norm_relu(self, inputs, is_training): + """Performs a batch normalization followed by a ReLU.""" + # We set fused=True for a significant speed boost. 
See
+        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+        inputs = tf.layers.batch_normalization(
+            inputs=inputs,
+            axis=1 if args.data_format == 'NCHW' else -1,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            training=is_training,
+            fused=True)
+        inputs = tf.nn.relu(inputs)
+        return inputs
+
+    def conv_bn_layer(self,
+                      name,
+                      images,
+                      kernel_shape,
+                      is_training,
+                      drop_rate=0.0):
+        with tf.name_scope(name) as scope:
+            kernel = tf.Variable(
+                tf.truncated_normal(
+                    kernel_shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            conv = tf.nn.conv2d(
+                images,
+                kernel, [1, 1, 1, 1],
+                data_format=args.data_format,
+                padding='SAME')
+            biases = tf.Variable(
+                tf.constant(
+                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(conv, biases)
+            out = self.batch_norm_relu(out, is_training)
+            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+            return out
+
+    def fc_layer(self, name, inputs, shape):
+        with tf.name_scope(name) as scope:
+            fc_w = tf.Variable(
+                tf.truncated_normal(
+                    shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            fc_b = tf.Variable(
+                tf.constant(
+                    0.0, shape=[shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+            return out
+
+    def network(self, images, class_dim, is_training):
+        """ VGG16 model structure.
+
+           TODO(kuke): enable this network to support the 'NCHW' data format
+        """
+
+        # conv1
+        conv1_1 = self.conv_bn_layer(
+            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+        conv1_2 = self.conv_bn_layer(
+            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+        # pool1
+        pool1 = tf.nn.max_pool(
+            conv1_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool1')
+        # conv2
+        conv2_1 = self.conv_bn_layer(
+            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+        conv2_2 = self.conv_bn_layer(
+            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+        # pool2
+        pool2 = tf.nn.max_pool(
+            conv2_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool2')
+        # conv3
+        conv3_1 = self.conv_bn_layer(
+            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+        conv3_2 = self.conv_bn_layer(
+            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+        conv3_3 = self.conv_bn_layer(
+            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+        # pool3
+        pool3 = tf.nn.max_pool(
+            conv3_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool3')
+        # conv4
+        conv4_1 = self.conv_bn_layer(
+            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+        conv4_2 = self.conv_bn_layer(
+            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv4_3 = self.conv_bn_layer(
+            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool4
+        pool4 = tf.nn.max_pool(
+            conv4_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # conv5
+        conv5_1 = self.conv_bn_layer(
+            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_2 = self.conv_bn_layer(
+            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_3 = self.conv_bn_layer(
+            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool5
+        pool5 = tf.nn.max_pool(
+            conv5_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool5')
+        # flatten
+        shape = int(np.prod(pool5.get_shape()[1:]))
+        pool5_flat =
tf.reshape(pool5, [-1, shape]) + # fc1 + drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) + fc1 = self.fc_layer('fc1', drop, [shape, 512]) + # fc2 + bn = self.batch_norm_relu(fc1, is_training) + drop = tf.layers.dropout(bn, rate=0.5, training=is_training) + fc2 = self.fc_layer('fc2', drop, [512, 512]) + + fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) + + return fc3 + + +def run_benchmark(cluster_spec, server): + """Run benchmark on cifar10 or flowers.""" + + if args.data_set == "cifar10": + class_dim = 10 + raw_shape = (3, 32, 32) + dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( + None, 3, 32, 32) + else: + class_dim = 102 + raw_shape = (3, 224, 224) + dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( + None, 3, 224, 224) + + device = tf.train.replica_device_setter( + worker_device="/job:worker/task:{}".format(args.task_index), + cluster=cluster_spec) + + with tf.device(device): + images = tf.placeholder(tf.float32, shape=dat_shape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + vgg16 = VGG16Model() + logits = vgg16.network(images, class_dim, is_training) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + avg_loss = tf.reduce_mean(loss) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + global_step = tf.Variable(0, name='global_step', trainable=False) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_loss, global_step=global_step) + + summary_op = tf.summary.merge_all() + init_op = tf.global_variables_initializer() + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + buf_size=5120), + batch_size=args.batch_size) + + # test + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + return np.mean(test_accs) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + hooks = [tf.train.StopAtStepHook(last_step=1000000)] + + with tf.train.MonitoredTrainingSession( + master=server.target, is_chief=(args.task_index == 0), + hooks=hooks) as sess: + iters, num_samples, start_time = 0, 0, 0.0 + for pass_id in range(args.num_passes): + # train + num_samples = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + train_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + train_labels = np.array(map(lambda x: x[1], data)).astype( + 'int64') + iter_begin_time = time.time() + _, 
loss, acc = sess.run([train_op, avg_loss, accuracy],
+                                          feed_dict={
+                                              images: train_images,
+                                              labels: train_labels,
+                                              is_training: True
+                                          })
+                    iters += 1
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
+                        % (pass_id, iters, loss, acc,
+                           len(data) / (time.time() - iter_begin_time)))
+                    num_samples += len(data)
+                train_elapsed = time.time() - start_time
+                # test
+                pass_test_acc = test()
+                print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+                      (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    print_arguments()
+
+    ps_hosts = args.ps_hosts.split(",")
+    worker_hosts = args.worker_hosts.split(",")
+
+    # Create a cluster from the parameter server and worker hosts.
+    cluster_spec = tf.train.ClusterSpec({
+        "ps": ps_hosts,
+        "worker": worker_hosts
+    })
+
+    # Create and start a server for the local task.
+    server = tf.train.Server(
+        cluster_spec, job_name=args.job_name, task_index=args.task_index)
+
+    if args.job_name == "ps":
+        print("start pserver")
+        server.join()
+    elif args.job_name == "worker":
+        print("start worker")
+        run_benchmark(cluster_spec, server)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index ae3295fe4115f457570203e61a56a637895e4770..7730453fc9292015465713232abda155a18a1aad 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
 
 if(NOT WITH_GPU)
     add_definitions(-DHPPL_STUB_FUNC)
+    add_definitions("-DCUPTI_LIB_PATH=\"\"")
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@@ -73,7 +74,14 @@ else()
     if(NOT CUDNN_FOUND)
         message(FATAL_ERROR "Paddle needs cudnn to compile")
     endif()
-
+    if(CUPTI_FOUND)
+        include_directories(${CUPTI_INCLUDE_DIR})
+        add_definitions(-DPADDLE_WITH_CUPTI)
+        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
+    else()
+        add_definitions("-DCUPTI_LIB_PATH=\"\"")
+        message(STATUS "Cannot find CUPTI; GPU profiling will be inaccurate.")
+    endif()
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
 
     # Include cuda and cudnn
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index de94bd5008effef1bf0fd3a125d4aed56e1b7f81..7edc8637727e300539a46bc3941ace87c87903b8 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -155,7 +155,8 @@ endif()
 include_directories(${CUDA_INCLUDE_DIRS})
 list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
-  list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+  # TODO(panyx0718): CUPTI only allows DSO?
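+  # with WITH_DSO=ON, CUPTI is not linked here; it is expected to be loaded at
+  # runtime from the CUPTI_LIB_PATH recorded by configure.cmake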
+  list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
 endif(NOT WITH_DSO)
 
 # setting nvcc arch flags
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..72ed0f1e5858d6d836743ceb038c7f4ad8f194cf
--- /dev/null
+++ b/cmake/cupti.cmake
@@ -0,0 +1,41 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+
+set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
+find_path(CUPTI_INCLUDE_DIR cupti.h
+    PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
+    $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
+    NO_DEFAULT_PATH
+  )
+
+get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
+
+set(TARGET_ARCH "x86_64")
+if(CMAKE_SYSTEM_PROCESSOR)
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+list(APPEND CUPTI_CHECK_LIBRARY_DIRS
+    ${CUPTI_ROOT}
+    ${CUPTI_ROOT}/lib64
+    ${CUPTI_ROOT}/lib
+    ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+    $ENV{CUPTI_ROOT}
+    $ENV{CUPTI_ROOT}/lib64
+    $ENV{CUPTI_ROOT}/lib
+    /usr/lib
+    ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
+    PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+    NO_DEFAULT_PATH
+    DOC "Path to cuPTI library.")
+
+get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
+if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
+    set(CUPTI_FOUND ON)
+else()
+    set(CUPTI_FOUND OFF)
+endif()
diff --git a/doc/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
index fec2d412f03f6b94422f0463d1985decd0c1bf99..cb766c3838133740892928b587edcf3843b7abce 100644
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -189,7 +189,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。
    "WITH_TESTING", "是否开启单元测试", "OFF"
    "WITH_DOC", "是否编译中英文文档", "OFF"
    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
-   "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+   "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
    "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
 
 BLAS
diff --git a/doc/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
index 29a1439e4cec50c15cb965a788070f21c704caad..556cbfdf087c340a7f7a1760f92325ab87eeea89 100644
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -191,7 +191,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_TESTING", "Build unit tests", "OFF"
    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
-   "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+   "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
 
    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
 
diff --git a/doc/howto/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
index a61d2267bfdb7c32da528735b20d7c6a531aaa1f..1ccc72eefbc730b2eab2d51f5b04e50728b735d7 100644
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -65,6 +65,7 @@ output_file = "output.paddle.model"
 merge_v2_model(net, param_file, output_file)
 ```
+
 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python`
[merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 #### 注意事项 diff --git a/doc/howto/cluster/multi_cluster/index_cn.rst b/doc/howto/cluster/multi_cluster/index_cn.rst index ef56b6ddb38e59f20f7248de1ceb952c7627ce76..eabf95eda0b20f91913201a6b4e5b56fa440597e 100644 --- a/doc/howto/cluster/multi_cluster/index_cn.rst +++ b/doc/howto/cluster/multi_cluster/index_cn.rst @@ -1,20 +1,35 @@ 在不同集群中运行 ================ +用户的集群环境不尽相同,为了方便大家的部署,我们提供了多种的集群部署方式,方便提交集群训练任务,以下将一一介绍: -PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: -- `Kubernetes `_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 -- `OpenMPI `_ 成熟的高性能并行计算框架。 -- `Fabric `_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 +`Kubernetes `_ 是Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持: -对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 `_ 找到。 +.. toctree:: + :maxdepth: 1 + + k8s_cn.md + k8s_distributed_cn.md -在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 +`OpenMPI `_ 是成熟的高性能并行计算框架,在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务: .. toctree:: :maxdepth: 1 - fabric_cn.md openmpi_cn.md - k8s_cn.md - k8s_distributed_cn.md + +`Fabric `_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法,如果想详细了解,请阅读以下指南: + +.. toctree:: + :maxdepth: 1 + + fabric_cn.md + +我们也支持在AWS上部署PaddlePaddle,详细请了解: + +.. toctree:: + :maxdepth: 1 + k8s_aws_cn.md + +您可以在 `cluster_train_v2 `_ 找到以上相关的例子。 + diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index ae24ced770492743065e37654b494caf6b4c5bc0..cdd6917239371a660d0df05bb623f0b94f8f11a3 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -23,6 +23,12 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android $ docker pull paddlepaddle/paddle:latest-dev-android ``` +对于国内用户,我们提供了加速访问的镜像源: + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + ### 编译PaddlePaddle C-API库 构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 Android的Docker开发镜像向用户提供两个可配置的参数: @@ -56,15 +62,15 @@ Android的Docker开发镜像向用户提供两个可配置的参数: - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 - ```bash - $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev - ``` +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev +``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 - ```bash - $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev - ``` +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev +``` 执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 @@ -155,7 +161,11 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. 
``` -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 +用户还可根据自己的需求设置其他编译参数。 + +- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`,最小化生成的库的大小。 +- 设置`CMAKE_BUILD_TYPE`为`Release`,获得最快的执行速度, +- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 **性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 0cf50181df4116beda3aa6faf836eda92edf6066..6af16fc114a2310e364023ec43cc3c64149af8f7 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -25,6 +25,12 @@ Users can directly use the published Docker image. $ docker pull paddlepaddle/paddle:latest-dev-android ``` +For users in China, we provide a faster mirror. + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + ### Build the Inference Library We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: @@ -86,19 +92,19 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht - To build the standalone toolchain for `armeabi-v7a` and Android API level 21: - ```bash - your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain - ``` +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. - To build the standalone toolchain for `arm64-v8a` and Android API level 21: - ```bash - your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain - ``` +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. 
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index c8da4a790be041d673a749cf200dadb839fb3a62..d134aad794b0a630dfc8395c484d111bc5462d9b 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -18,7 +18,6 @@ import shlex
 from recommonmark import parser, transform
 import paddle
 import paddle.v2
-import paddle.fluid
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index a4cb2b7170fc8cdfb05e97738f7b6feb8165fce0..1f057d2e839896722fedba6634607bcdf2fd893a 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -18,7 +18,6 @@ import shlex
 from recommonmark import parser, transform
 import paddle
 import paddle.v2
-import paddle.fluid
 
 MarkdownParser = parser.CommonMarkParser
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 0d2691e8115ad6de46dcd4fcd5b7fd79ed60ecb9..d3155d33d0b461c9a3889ed8ae2ad9ee400a60fe 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -58,13 +58,13 @@ static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
     var->GetMutable<ReaderHolder>();
   } else if (var_type == proto::VarType::CHANNEL) {
     var->GetMutable<ChannelHolder>();
-  } else if (var_type == proto::VarType::NCCL_COM) {
-    // GetMutable will be called in ncclInit
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in the operator that uses the variable
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
         "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, NCCL_COM]",
+        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
         var_type);
   }
 }
@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+    // TODO(panyx0718): Need a program id to distinguish programs.
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_),
+                                       op_desc->Block()->ID());
 
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 5b43f5a8a4a1c128b04ac206d387e30c55f533fe..38f22b89143c3e23c8368b9281ccc757a892a373 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -113,7 +113,10 @@ message VarType {
     PLACE_LIST = 14;
     READER = 15;
     CHANNEL = 16;
-    NCCL_COM = 17;
+    // Any runtime-decided variable type is RAW;
+    // RAW variables should manage their own allocations
+    // in operators such as nccl_op
+    RAW = 17;
   }
 
   required Type type = 1;
@@ -164,4 +167,6 @@ message BlockDesc {
 // Please refer to
 // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
 // for more details.
+// TODO(panyx0718): A model can have multiple programs. Need a
+// way to distinguish them. Maybe ID or name?
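+// A ProgramDesc carries every block of one program; by convention blocks[0]
+// is the global block.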
message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 4cf14c8da547d79258e99d0c64e83f9218a92910..e2f4e9cad1996578b7c51257785e1273d126f80f 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -31,8 +31,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
   os << "{";
   for (auto &v : lod) {
     os << "{";
+    bool is_first = true;
     for (auto &i : v) {
-      os << i << ",";
+      if (is_first) {
+        os << i;
+        is_first = false;
+      } else {
+        os << ", " << i;
+      }
     }
     os << "}";
   }
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index b72aad6fb538ac483e9ce6fc9cb866c75190f006..614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -125,6 +125,8 @@ class OpDesc {
 
   BlockDesc *Block() { return this->block_; }
 
+  const BlockDesc &BlockRef() const { return *this->block_; }
+
   void SetBlock(BlockDesc *block) { this->block_ = block; }
 
  private:
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 71c5ab3db937f70ff84391e98d28f023f6dddcfb..80eb9889670744ae527ea29609b33631a021bfa8 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -32,23 +32,11 @@ void ReadBinaryFile(const std::string& filename, std::string& contents) {
   inputfs.close();
 }
 
-bool IsParameter(const framework::VarDesc* var,
-                 const framework::ProgramDesc& main_program) {
-  if (var->Persistable()) {
-    // There are many unreachable variables in the program
-    for (size_t i = 0; i < main_program.Size(); ++i) {
-      const framework::BlockDesc& block = main_program.Block(i);
-      for (auto* op : block.AllOps()) {
-        if (op->Type() == framework::kFeedOpType) {
-          continue;
-        }
-        for (auto input_argument_name : op->InputArgumentNames()) {
-          if (input_argument_name == var->Name()) {
-            return true;
-          }
-        }
-      }
-    }
+bool IsPersistable(const framework::VarDesc* var) {
+  if (var->Persistable() &&
+      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+    return true;
   }
   return false;
 }
@@ -65,8 +53,8 @@ void LoadPersistables(framework::Executor& executor,
   std::vector<std::string> paramlist;
   for (auto* var : global_block.AllVars()) {
-    if (IsParameter(var, main_program)) {
-      VLOG(3) << "parameter's name: " << var->Name();
+    if (IsPersistable(var)) {
+      VLOG(3) << "persistable variable's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
       new_var->SetShape(var->GetShape());
@@ -101,7 +89,6 @@ void LoadPersistables(framework::Executor& executor,
 
   executor.Run(*load_program, &scope, 0, true, true);
 
-  VLOG(3) << "Ran loading successfully";
   delete load_program;
 }
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 4ead540e5dd87ccf66168ab29c9d4aeaf6921269..e7ffb00ec8d8926193fe510ebdb7185f75c90906 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -30,5 +30,5 @@ inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
-inference_test(understand_sentiment)
+inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 443193aae8b38323883d460bc37a9c14430fc8bb..184924016634bba26204d937744ca5fa87cd443c 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -32,16 +32,42 @@ TEST(inference, label_semantic_roles) {
   paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1,
       ctx_p2, mark;
   paddle::framework::LoD lod{{0, 4, 10}};
-
-  SetupLoDTensor(word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(
-      predicate, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_n2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_n1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_0, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_p1, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(ctx_p2, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(mark, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  int64_t word_dict_len = 44068;
+  int64_t predicate_dict_len = 3162;
+  int64_t mark_dict_len = 2;
+
+  SetupLoDTensor(word,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(predicate,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(predicate_dict_len - 1));
+  SetupLoDTensor(ctx_n2,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_n1,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_0,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_p1,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(ctx_p2,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
+  SetupLoDTensor(mark,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(mark_dict_len - 1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&word);
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index e67064fb61d18ff8db540a68e94729649e44cd1a..824b3274ebc7ba046e61798b3f61ef9924a75679 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -31,7 +31,12 @@ TEST(inference, understand_sentiment) {
   paddle::framework::LoDTensor words;
   paddle::framework::LoD lod{{0, 4, 10}};
-  SetupLoDTensor(words, lod, static_cast<int64_t>(0), static_cast<int64_t>(10));
+  int64_t word_dict_len = 5147;
+
+  SetupLoDTensor(words,
+                 lod,
+                 static_cast<int64_t>(0),
+                 static_cast<int64_t>(word_dict_len - 1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&words);
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index e2f2f36a8222e03f77eca65d6331b4a52c0eea82..1481760c529c29a7290f476e2a22e1ded5ab7787 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -31,12 +31,12 @@ TEST(inference, word2vec) {
   paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word;
   paddle::framework::LoD lod{{0, 1}};
-  int64_t dict_size = 2072;  // Hard-coding the size of dictionary
+  int64_t dict_size = 2073;  // The size of dictionary
 
-  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size);
-  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size);
+  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&first_word);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index abe2032cc058e50a63ac72cccd90e060c6e14479..49518e50d8541477234f17ac5b8709aeb57662ff 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -101,8 +101,8 @@ void TestInference(const std::string& dirname,
   if (IsCombined) {
     // All parameters are saved in a single file.
     // Hard-coding the file names of program and parameters in unittest.
-    // Users are free to specify different filename
-    // (provided: the filenames are changed in the python api as well: io.py)
+    // The file names should be consistent with that used in Python API
+    // `fluid.io.save_inference_model`.
     std::string prog_filename = "__model_combined__";
     std::string param_filename = "__params_combined__";
     inference_program = paddle::inference::Load(executor,
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 8f14fd376ae51eff0f56c5a8d679c49cec23bd68..4da46e94c5cd979507fed80b35ebedf0cc6791d0 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -11,6 +11,8 @@ function(op_library TARGET)
   set(cc_srcs)
   set(cu_srcs)
   set(cu_cc_srcs)
+  set(cudnn_cu_cc_srcs)
+  set(CUDNN_FILE)
   set(op_common_deps operator op_registry math_function)
   set(options "")
   set(oneValueArgs "")
@@ -30,10 +32,16 @@ function(op_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
       list(APPEND cu_srcs ${TARGET}.cu)
     endif()
+    string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
+      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
+    endif()
   else()
     foreach(src ${op_library_SRCS})
       if (${src} MATCHES ".*\\.cu$")
         list(APPEND cu_srcs ${src})
+      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
+        list(APPEND cudnn_cu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
@@ -54,7 +62,7 @@ function(op_library TARGET)
     set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
   endif()
   if (WITH_GPU)
-    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   else()
     cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
@@ -98,6 +106,12 @@ function(op_library TARGET)
     set(pybind_flag 1)
   endif()
 
+  # pybind USE_OP_DEVICE_KERNEL for CUDNN
+  list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
+  if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
+  endif()
+
   # pybind USE_OP
   if (${pybind_flag} EQUAL 0)
     file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -141,6 +155,7 @@ op_library(print_op DEPS lod_tensor)
 op_library(adagrad_op DEPS selected_rows_functor)
 op_library(maxout_op DEPS maxouting)
 op_library(unpool_op DEPS unpooling)
+op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
@@ -152,43 +167,17 @@ op_library(lstm_op DEPS sequence2batch
lstm_compute) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) -op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(create_reader_op DEPS reader) -# Regist multiple Kernel to pybind if (WITH_GPU) - -op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS - vol2col depthwise_conv) - -op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function) -op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling) -op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc - conv_transpose_cudnn_op.cu.cc DEPS vol2col) -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n") -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n") -file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n") + op_library(conv_op DEPS vol2col depthwise_conv) else() -op_library(conv_op SRCS conv_op.cc DEPS vol2col) -op_library(pool_op SRCS pool_op.cc DEPS pooling) -op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col) + op_library(conv_op DEPS vol2col) endif() - -cc_library(batch_size_like SRCS batch_size_like.cc DEPS op_registry) - -op_library(fill_constant_batch_size_like_op - SRCS fill_constant_batch_size_like_op.cc fill_constant_batch_size_like_op.cu.cc - DEPS batch_size_like) - -op_library(uniform_random_batch_size_like_op - SRCS uniform_random_batch_size_like_op.cc - DEPS batch_size_like uniform_random_op) - -op_library(gaussian_random_batch_size_like_op - SRCS gaussian_random_batch_size_like_op.cc - DEPS batch_size_like gaussian_random_op) +op_library(conv_transpose_op DEPS vol2col) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/batch_size_like.cc b/paddle/fluid/operators/batch_size_like.cc deleted file mode 100644 index 4d4a6d4c472fe2dedb0cd37bff7bbf5bdad3ead7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/batch_size_like.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/batch_size_like.h"
-
-namespace paddle {
-namespace operators {
-
-void BatchSizeLikeOp::InferShape(framework::InferShapeContext *ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of %s should not be null.", Type());
-  PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of %s should not be null.",
-                 Type());
-
-  auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-  PADDLE_ENFORCE_GT(shape.size(), 0);
-  std::vector<int64_t> shape_int64(shape.size(), 0);
-  std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                 [](int a) { return static_cast<int64_t>(a); });
-  auto output_dim = framework::make_ddim(shape_int64);
-
-  int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
-  PADDLE_ENFORCE_GE(input_dim_idx, 0);
-  PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
-
-  int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
-  PADDLE_ENFORCE_GE(output_dim_idx, 0);
-  PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
-
-  output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
-  ctx->SetOutputDim("Out", output_dim);
-}
-
-BatchSizeLikeOpMaker::BatchSizeLikeOpMaker(OpProto *proto,
-                                           OpAttrChecker *op_checker)
-    : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput("Input",
-           "(Tensor) Tensor "
-           "whose input_dim_idx'th dimension specifies the batch_size");
-  AddOutput("Out",
-            "(Tensor) Tensor of specified shape will be filled "
-            "with the specified value");
-  AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
-  AddAttr<int>("input_dim_idx",
-               "(int, default 0) The index of input's batch size dimension")
-      .SetDefault(0);
-  AddAttr<int>("output_dim_idx",
-               "(int, default 0) The index of output's batch size dimension")
-      .SetDefault(0);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index 87e8f053a73a23cd9e231ada6501d0d9344bb1a6..0bdf27e620a3a7c7b62b955f708a5e2aad1a6986 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -24,12 +24,50 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of %s should not be null.", Type());
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of %s should not be null.", Type());
+
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE_GT(shape.size(), 0);
+    std::vector<int64_t> shape_int64(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+    auto output_dim = framework::make_ddim(shape_int64);
+
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
+
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
+  }
 };
 
 class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker);
+  BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      :
framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor) Tensor " + "whose input_dim_idx'th dimension specifies the batch_size"); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "(vector) The shape of the output"); + AddAttr("input_dim_idx", + "(int, default 0) The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "(int, default 0) The index of output's batch size dimension") + .SetDefault(0); + } }; } // namespace operators diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc index c536cf6b6b822c8d9553d7d2cf57902e5e6e5343..2b3f26c0a890c33f9b4f4c8a5a271123d7ff0b31 100644 --- a/paddle/fluid/operators/bipartite_match_op.cc +++ b/paddle/fluid/operators/bipartite_match_op.cc @@ -94,6 +94,38 @@ class BipartiteMatchKernel : public framework::OpKernel { } } + void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist, + T overlap_threshold) const { + constexpr T kEPS = static_cast(1e-6); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + // the j-th column has been matched to one entity. + continue; + } + int max_row_idx = -1; + T max_dist = -1; + for (int i = 0; i < row; ++i) { + T dist = dist_data[i * col + j]; + if (dist < kEPS) { + // distance is 0 between m-th row and j-th column + continue; + } + if (dist >= overlap_threshold && dist > max_dist) { + max_row_idx = i; + max_dist = dist; + } + } + if (max_row_idx != -1) { + PADDLE_ENFORCE_EQ(match_indices[j], -1); + match_indices[j] = max_row_idx; + match_dist[j] = max_dist; + } + } + } + void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); auto* match_indices = context.Output("ColToRowMatchIndices"); @@ -120,13 +152,21 @@ class BipartiteMatchKernel : public framework::OpKernel { int* indices = match_indices->data(); T* dist = match_dist->data(); + auto type = context.Attr("match_type"); + auto threshold = context.Attr("dist_threshold"); if (n == 1) { BipartiteMatch(*dist_mat, indices, dist); + if (type == "per_prediction") { + ArgMaxMatch(*dist_mat, indices, dist, threshold); + } } else { auto lod = dist_mat->lod().back(); for (size_t i = 0; i < lod.size() - 1; ++i) { Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); BipartiteMatch(one_ins, indices + i * col, dist + i * col); + if (type == "per_prediction") { + ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); + } } } } @@ -147,6 +187,19 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { "This tensor can contain LoD information to represent a batch of " "inputs. One instance of this batch can contain different numbers of " "entities."); + AddAttr( + "match_type", + "(string, defalut: per_prediction) " + "The type of matching method, should be 'bipartite' or " + "'per_prediction', 'bipartite' by defalut.") + .SetDefault("bipartite") + .InEnum({"bipartite", "per_prediction"}); + AddAttr( + "dist_threshold", + "(float, defalut: 0.5) " + "If `match_type` is 'per_prediction', this threshold is to determine " + "the extra matching bboxes based on the maximum distance.") + .SetDefault(0.5); AddOutput("ColToRowMatchIndices", "(Tensor) A 2-D Tensor with shape [N, M] in int type. " "N is the batch size. 
If ColToRowMatchIndices[i][j] is -1, it " @@ -168,10 +221,10 @@ distance matrix. For input 2D matrix, the bipartite matching algorithm can find the matched column for each row, also can find the matched row for each column. And this operator only calculate matched indices from column to row. For each instance, the number of matched indices is the number of -of columns of the input ditance matrix. +of columns of the input distance matrix. There are two outputs to save matched indices and distance. -A simple description, this algothrim matched the best (maximum distance) +A simple description, this algorithm matches the best (maximum distance) row entity to the column entity and the matched indices are not duplicated in each row of ColToRowMatchIndices. If the column entity is not matched any row entity, set -1 in ColToRowMatchIndices. diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index eb0e43ad2d84f681f39ed4adc5a27f6d3ab00f08..208a4481c6afe1b8f62e8f675c951c3349639f46 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" @@ -34,12 +35,46 @@ class ConcatKernel : public framework::OpKernel<T> { auto out_stride = framework::stride_numel(out->dims()); size_t output_offset = 0; - for (auto* in : ins) { - auto in_stride = framework::stride_numel(in->dims()); - StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, - out->data<T>() + output_offset, out_stride, - in->data<T>(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; + + // If axis >= 1, copying directly into out would need to call cuda + // memcpy many times. Copy the inputs to cpu, do the strided copy there, + // then copy the result to the gpu output.
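The staged concat above is still ordinary axis-wise concatenation, just done on the CPU first. A minimal NumPy sketch of the offset bookkeeping that the strided copies perform (names and shapes are illustrative, not Paddle's actual API):

import numpy as np

def concat_with_strided_copies(ins, axis):
    # "stride_numel" view: rows are the dims before `axis`, columns are
    # everything from `axis` onwards, so each input contributes one
    # contiguous column block per row.
    outer = int(np.prod(ins[0].shape[:axis]))
    out_shape = list(ins[0].shape)
    out_shape[axis] = sum(t.shape[axis] for t in ins)
    out = np.empty((outer, int(np.prod(out_shape[axis:]))), ins[0].dtype)
    offset = 0
    for t in ins:
        cols = int(np.prod(t.shape[axis:]))
        # One strided copy per input, advancing the column offset.
        out[:, offset:offset + cols] = t.reshape(outer, cols)
        offset += cols
    return out.reshape(out_shape)

a = np.ones((2, 3, 4), np.float32)
b = np.zeros((2, 1, 4), np.float32)
assert np.array_equal(concat_with_strided_copies([a, b], 1),
                      np.concatenate([a, b], axis=1))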
+ + if (platform::is_gpu_place(place) && axis >= 1) { + platform::CPUPlace copy_place; + auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place); + framework::Tensor cpu_out; + cpu_out.Resize(out->dims()); + cpu_out.mutable_data(copy_place); + auto& dev_ctx = ctx.device_context(); + std::vector> cpu_ins; + for (auto* in : ins) { + std::unique_ptr cpu_in(new framework::Tensor); + framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get()); + cpu_ins.emplace_back(std::move(cpu_in)); + } + // TODO(dzhwinter): overlap copy and compute stream + // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ + dev_ctx.Wait(); + + for (auto& in : cpu_ins) { + auto& cpu_in = *in.get(); + auto in_stride = framework::stride_numel(cpu_in.dims()); + + StridedNumelCopyWithAxis( + cpu_ctx, axis, cpu_out.data() + output_offset, out_stride, + cpu_in.data(), in_stride, in_stride[axis]); + output_offset += in_stride[axis]; + } + framework::TensorCopy(cpu_out, place, dev_ctx, out); + } else { + for (auto* in : ins) { + auto in_stride = framework::stride_numel(in->dims()); + StridedNumelCopyWithAxis(ctx.device_context(), axis, + out->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[axis]); + output_offset += in_stride[axis]; + } } } }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 2ecece707314f1e8b1b0bc9ad28f53ec5e1d405e..83b7708bf337b70f97c5e9126efd142b9b957b00 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -54,12 +54,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] - - (dilations[i] * (filter_dims[i + 2] - 1) + 1) > - 0, - "Due to the settings of paddings, filter_dims and " - "dilations, the output size is less than 0, please check " - "again."); output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], paddings[i], strides[i])); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index c93c2e73f720ae025a4ad4f8146a7c6c3c382eea..12b45f1d65019f623268cb9da9004bac5e1f72a3 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -31,7 +31,14 @@ using Tensor = framework::Tensor; inline int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - const int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + PADDLE_ENFORCE( + output_size > 0, + "Due to the settings of padding(%d), filter_size(%d), dilation(%d) and " + "stride(%d), the output size is less than 0, please check " + "again. 
Input_size:%d", + padding, filter_size, dilation, stride, input_size); + return output_size; } inline bool IsExpand(std::vector& filter_dim, diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ee9044b1f5d46dc725c9583d0d90ab5681d2850c..7266f3276477891d3c7b6827316a428ef7a31c6e 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -177,8 +177,8 @@ std::shared_ptr RPCClient::GetChannel(const std::string& ep) { args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - auto ch = std::shared_ptr( - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args)); + auto ch = + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); channels_[ep] = ch; return ch; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index ee0e3533ce028992af3d4558e3fd198a09c4816b..8e9923c87ce22ed229f78ef15430e50cab16c947 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -129,6 +129,8 @@ class ListenAndServOp : public framework::OperatorBase { } if (exit_flag) { rpc_service_->ShutDown(); + rpc_service_->SetCond(1); + break; } try { executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc index 0994bba782b42be994ae479f4c9c4de5a2e384ed..9185666c56c4621d42429c9cfdb079001c6336f1 100644 --- a/paddle/fluid/operators/nccl_op.cc +++ b/paddle/fluid/operators/nccl_op.cc @@ -65,7 +65,7 @@ class NCCLInitOpVarTypeInference : public framework::VarTypeInference { framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Communicator").front(); auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::NCCL_COM; + auto var_type = framework::proto::VarType::RAW; out_var.SetType(var_type); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index c7729ad1322588c0558a136dbd5d48f757d38412..a87a3511ee46dd657c27da26feb43ba43a08f25d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -19,6 +19,11 @@ namespace operators { int PoolOutputSize(int input_size, int filter_size, int padding, int stride) { int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + PADDLE_ENFORCE(output_size > 0, + "Due to the settings of padding(%d), filter_size(%d) and " + "stride(%d), the output size is less than 0, please check " + "again. 
Input_size:%d", + padding, filter_size, stride, input_size); return output_size; } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a90ffb4ff3e6e1bfcc0d00bc4714b3067fdede6c..3580932356fd5f29d5e4d00a70e64c207c64e41e 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -121,10 +121,15 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL(reshape, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL( - reshape_grad, ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel, + ops::ReshapeKernel, + ops::ReshapeKernel, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel, + ops::ReshapeGradKernel, + ops::ReshapeGradKernel, + ops::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu index d5ceaf784c0e4b1c8d527958be31d5186c2823d3..c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c 100644 --- a/paddle/fluid/operators/reshape_op.cu +++ b/paddle/fluid/operators/reshape_op.cu @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/reshape_op.h" +using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL( - reshape, - paddle::operators::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL( - reshape_grad, - paddle::operators::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel, + paddle::operators::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL(reshape_grad, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel, + paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 58850bf566e00f88de19305110e2ef696b73467e..178976f96fdbd08cead7b7c518ea1fbaaa2a5db8 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -121,9 +121,27 @@ This operator will send tensor to recv_op at the parameter server. 
} }; +class SendOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SendOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); +REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker, + ops::SendOpMaker, ops::SendOpVarTypeInference, + ops::SendOpShapeInference); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 008c012a32e0c88dfb0c05d7e485ffc367b3cac5..e9fb845b475ff5776bf948ab120a44c16ed87aa0 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -95,7 +95,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(f::proto::DataType::FP32); + var->SetDataType(f::proto::VarType::FP32); } } @@ -122,33 +122,37 @@ void StartServerNet(bool is_sparse) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); + f::BlockDesc *optimize_block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. - AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); f::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"Fanin", 1}); attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); - attrs.insert({"OptimizeBlock", block}); + attrs.insert({"OptimizeBlock", optimize_block}); listen_and_serv_op = - f::OpRegistry::CreateOp("listen_and_serv", {}, {}, attrs); + f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); listen_and_serv_op->Run(scope, place); } TEST(SendRecvOp, CPUDense) { std::thread server_thread(StartServerNet, false); - sleep(10); // wait server to start + sleep(5); // wait server to start // local net f::Scope scope; p::CPUPlace place; InitTensorsInScope(scope, place); + // create rpc client var + scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); - auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, - {{"Out", {"Out"}}}, attrs); + auto send_op = f::OpRegistry::CreateOp( + "send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); send_op->Run(scope, place); auto in_var = scope.Var("x1"); @@ -175,11 +179,13 @@ TEST(SendRecvOp, CPUSparse) { p::CPUPlace place; p::CPUDeviceContext ctx(place); InitSelectedRowsInScope(scope, place); + scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; attrs.insert({"endpoints", std::vector({"127.0.0.1:6174"})}); attrs.insert({"epmap", std::vector({"127.0.0.1:6174"})}); - auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}}, - {{"Out", {"Out"}}}, attrs); + auto send_op = 
f::OpRegistry::CreateOp( + "send", {{"X", {"x1"}}}, + {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); send_op->Run(scope, place); auto x0 = scope.Var("x0")->GetMutable(); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0cee21d14f29c03ebabcb921ecc4f29f352b55..28a668c86aa322803a65b916b4273181f5652e21 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,3 +1,5 @@ +proto_library(profiler_proto SRCS profiler.proto) + if(WITH_GPU) cc_library(enforce SRCS enforce.cc DEPS) else() @@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) -cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) +cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bbdfa5fd5d9781d5f2b310d2142b1b4decbf9b --- /dev/null +++ b/paddle/fluid/platform/device_tracer.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device_tracer.h" +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace platform { +namespace { + +thread_local const char *cur_annotation = nullptr; +std::once_flag tracer_once_flag; +DeviceTracer *tracer = nullptr; +} // namespace +#ifdef PADDLE_WITH_CUPTI + +namespace { +// TODO(panyx0718): Revisit the buffer size here. +uint64_t kBufSize = 32 * 1024; +uint64_t kAlignSize = 8; + +#define ALIGN_BUFFER(buffer, align) \ + (((uintptr_t)(buffer) & ((align)-1)) \ + ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \ + : (buffer)) + +#define CUPTI_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char *errstr; \ + dynload::cuptiGetResultString(_status, &errstr); \ + fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ + __FILE__, __LINE__, #call, errstr); \ + exit(-1); \ + } \ + } while (0) + +void EnableActivity() { + // Device activity record is created when CUDA initializes, so we + // want to enable it before cuInit() or any CUDA runtime call. 
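The bufferRequested callback above over-allocates by kAlignSize and rounds the buffer pointer up with ALIGN_BUFFER. The same round-up arithmetic in plain Python (align must be a power of two):

def align_up(addr, align):
    # Equivalent to the ALIGN_BUFFER macro: round addr up to the next
    # multiple of align, leaving already-aligned addresses untouched.
    rem = addr & (align - 1)
    return addr if rem == 0 else addr + (align - rem)

assert align_up(0x1000, 8) == 0x1000  # already aligned
assert align_up(0x1003, 8) == 0x1008  # rounded up to the 8-byte boundary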
+ CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + // We don't track these activities for now. + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); +} + +void DisableActivity() { + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); + // Disable all other activity record kinds. + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); +} + +void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, + size_t *maxNumRecords) { + uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize); + *size = kBufSize; + *buffer = ALIGN_BUFFER(buf, kAlignSize); + *maxNumRecords = 0; +} + +void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, + size_t size, size_t validSize) { + CUptiResult status; + CUpti_Activity *record = NULL; + if (validSize > 0) { + do { + status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record); + if (status == CUPTI_SUCCESS) { + switch (record->kind) { + case CUPTI_ACTIVITY_KIND_KERNEL: + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto *kernel = + reinterpret_cast(record); + tracer->AddKernelRecords(kernel->start, kernel->end, + kernel->deviceId, kernel->streamId, + kernel->correlationId); + break; + } + default: { break; } + } + } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { + // Seems not an error in this case. 
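bufferCompleted above drains one buffer by calling cuptiActivityGetNextRecord until CUPTI_ERROR_MAX_LIMIT_REACHED, which signals exhaustion rather than failure. A sketch of that consume-until-sentinel pattern with a hypothetical record source (the status values are stand-ins, not CUPTI's real codes):

SUCCESS, MAX_LIMIT_REACHED = "success", "max_limit_reached"

def drain_buffer(next_record):
    kernels = []
    while True:
        status, record = next_record()
        if status == SUCCESS:
            if record["kind"] == "KERNEL":  # keep only kernel records
                kernels.append(record)
        elif status == MAX_LIMIT_REACHED:
            break  # buffer exhausted; not an error, mirroring the C++ above
        else:
            raise RuntimeError("activity API error: %s" % status)
    return kernels

def record_source(records):
    it = iter(records)
    def next_record():
        try:
            return SUCCESS, next(it)
        except StopIteration:
            return MAX_LIMIT_REACHED, None
    return next_record

src = record_source([{"kind": "KERNEL"}, {"kind": "MEMCPY"}])
assert len(drain_buffer(src)) == 1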
+ break; + } else { + CUPTI_CALL(status); + } + } while (1); + + size_t dropped; + CUPTI_CALL( + dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); + if (dropped != 0) { + fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped); + } + } + free(buffer); +} +} // namespace + +class DeviceTracerImpl : public DeviceTracer { + public: + DeviceTracerImpl() : enabled_(false) {} + + void AddAnnotation(uint64_t id, const std::string &anno) { + std::lock_guard<std::mutex> l(trace_mu_); + correlations_[id] = anno; + } + + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, + uint32_t stream_id, uint32_t correlation_id) { + std::lock_guard<std::mutex> l(trace_mu_); + kernel_records_.push_back( + KernelRecord{start, end, device_id, stream_id, correlation_id}); + } + + bool IsEnabled() { + std::lock_guard<std::mutex> l(trace_mu_); + return enabled_; + } + + void Enable() { + std::lock_guard<std::mutex> l(trace_mu_); + if (enabled_) { + fprintf(stderr, "DeviceTracer already enabled\n"); + return; + } + EnableActivity(); + + // Register callbacks with CUPTI for buffer requests and buffer completion. + CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested, + bufferCompleted)); + + CUptiResult ret; + ret = dynload::cuptiSubscribe( + &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this); + if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) { + fprintf(stderr, "CUPTI subscriber limit reached.\n"); + } else if (ret != CUPTI_SUCCESS) { + fprintf(stderr, "Failed to create CUPTI subscriber.\n"); + } + CUPTI_CALL( + dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); + + CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); + enabled_ = true; + } + + proto::Profile GenProfile() { + std::lock_guard<std::mutex> l(trace_mu_); + proto::Profile profile_pb; + profile_pb.set_start_ns(start_ns_); + profile_pb.set_end_ns(end_ns_); + std::map<std::string, std::vector<uint64_t>> event_times; + for (const KernelRecord &r : kernel_records_) { + if (correlations_.find(r.correlation_id) == correlations_.end()) { + fprintf(stderr, "cannot relate a kernel activity\n"); + continue; + } + auto *event = profile_pb.add_events(); + event->set_name(correlations_.at(r.correlation_id)); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_stream_id(r.stream_id); + event->set_device_id(r.device_id); + event_times[event->name()].push_back(r.end_ns - r.start_ns); + } + for (const auto &et : event_times) { + fprintf( + stderr, "%s: total: %fms invoked cuda kernels: %lu\n", + et.first.c_str(), + std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0, + et.second.size()); + } + return profile_pb; + } + + void Disable() { + // flush might cause additional calls to DeviceTracer. + dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + std::lock_guard<std::mutex> l(trace_mu_); + DisableActivity(); + dynload::cuptiUnsubscribe(subscriber_); + CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); + PADDLE_ENFORCE(dynload::cuptiFinalize()); + enabled_ = false; + } + + private: + static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, const void *cbdata) { + auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata); + DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata); + + if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && + (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { + if (cbInfo->callbackSite == CUPTI_API_ENTER) { + const std::string anno = + cur_annotation ?
cur_annotation : cbInfo->symbolName; + tracer->AddAnnotation(cbInfo->correlationId, anno); + } + } else { + VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; + } + } + + std::mutex trace_mu_; + bool enabled_; + uint64_t start_ns_; + uint64_t end_ns_; + std::vector kernel_records_; + std::unordered_map correlations_; + CUpti_SubscriberHandle subscriber_; +}; + +#endif // PADDLE_WITH_CUPTI + +class DeviceTracerDummy : public DeviceTracer { + public: + DeviceTracerDummy() {} + + void AddAnnotation(uint64_t id, const std::string &anno) {} + + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, + uint32_t stream_id, uint32_t correlation_id) {} + + bool IsEnabled() { return false; } + + void Enable() {} + + proto::Profile GenProfile() { return proto::Profile(); } + + void Disable() {} +}; + +void CreateTracer(DeviceTracer **t) { +#ifdef PADDLE_WITH_CUPTI + *t = new DeviceTracerImpl(); +#else + *t = new DeviceTracerDummy(); +#endif // PADDLE_WITH_CUPTI +} + +DeviceTracer *GetDeviceTracer() { + std::call_once(tracer_once_flag, CreateTracer, &tracer); + return tracer; +} + +void SetCurAnnotation(const char *anno) { cur_annotation = anno; } + +void ClearCurAnnotation() { cur_annotation = nullptr; } + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h new file mode 100644 index 0000000000000000000000000000000000000000..06cea84cc80ebefe9f5c396673cc9a35673f718f --- /dev/null +++ b/paddle/fluid/platform/device_tracer.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/profiler.pb.h" + +namespace paddle { +namespace platform { + +/////////////////////// +// WARN: Under Development. Don't depend on it yet. +////////////////////// + +// DeviceTracer performs the following tasks: +// 1. Register cuda callbacks for various events: kernel, memcpy, etc. +// 2. Collect cuda statistics: start/end ts, memory, etc. +// 3. Generate a protobuf for further analysis. +class DeviceTracer { + public: + struct KernelRecord { + uint64_t start_ns; + uint64_t end_ns; + uint32_t device_id; + uint32_t stream_id; + uint32_t correlation_id; + }; + + virtual ~DeviceTracer() {} + // Needs to be called once before use. + virtual void Enable() = 0; + // Needs to be called once after use. + virtual void Disable() = 0; + + // Add a pair to correlate internal cuda id with high level + // annotation (string). So cuda statistics can be represented by + // human-readable annotations. + virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + + // Add a cuda kernel stats. `correlation_id` will be mapped to annotation + // added before for human readability. + virtual void AddKernelRecords(uint64_t start, uint64_t end, + uint32_t device_id, uint32_t stream_id, + uint32_t correlation_id) = 0; + + // Generate a proto after done (Disabled). 
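The contract spelled out in these comments is Enable() once before use, Disable() once after, and GenProfile() only after disabling. A hypothetical Python mirror of that lifecycle (the real class is the C++ above; this is just a sketch of the intended call order):

class TracerSketch:
    def __init__(self):
        self.enabled = False
        self.records = []

    def enable(self):
        assert not self.enabled, "Enable() is called once before use"
        self.enabled = True

    def add_kernel_record(self, annotation, start_ns, end_ns):
        if self.enabled:
            self.records.append((annotation, end_ns - start_ns))

    def disable(self):
        self.enabled = False

    def gen_profile(self):
        assert not self.enabled, "GenProfile() runs only after Disable()"
        return self.records

t = TracerSketch()
t.enable()
t.add_kernel_record("mul_b0", 100, 400)
t.disable()
print(t.gen_profile())  # [('mul_b0', 300)]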
+ virtual proto::Profile GenProfile() = 0; + + virtual bool IsEnabled() = 0; +}; + +// Get a DeviceTracer. +DeviceTracer* GetDeviceTracer(); + +// Set a name for the cuda kernel operation being launched by the thread. +void SetCurAnnotation(const char* anno); +// Clear the name after the operation is done. +void ClearCurAnnotation(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 264b4ebf2c06d9e688a32a223dff3ec079333fd9..567c137a55e4e0cb0b5080893be305e847bb61e1 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,8 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc - DEPS dynamic_loader) + +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +if (CUPTI_FOUND) + list(APPEND CUDA_SRCS cupti.cc) +endif(CUPTI_FOUND) +nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25660c6ed411bbe444ac8aa10a324cbed9c9d4f --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUPTI + +#include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag cupti_dso_flag; +void *cupti_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUPTI_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h new file mode 100644 index 0000000000000000000000000000000000000000..a79868c18c14b6bcdf85d60e766c7ec8be993f28 --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_CUPTI +#include +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cupti_dso_flag; +extern void *cupti_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cupti routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \ + std::call_once(cupti_dso_flag, \ + paddle::platform::dynload::GetCUPTIDsoHandle, \ + &cupti_dso_handle); \ + void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CUPTI_ROUTINE_EACH(__macro) \ + __macro(cuptiActivityEnable); \ + __macro(cuptiActivityDisable); \ + __macro(cuptiActivityRegisterCallbacks); \ + __macro(cuptiActivityGetAttribute); \ + __macro(cuptiActivitySetAttribute); \ + __macro(cuptiGetTimestamp); \ + __macro(cuptiActivityGetNextRecord); \ + __macro(cuptiGetResultString); \ + __macro(cuptiActivityGetNumDroppedRecords); \ + __macro(cuptiActivityFlushAll); \ + __macro(cuptiFinalize); \ + __macro(cuptiSubscribe); \ + __macro(cuptiUnsubscribe); \ + __macro(cuptiEnableCallback); + +CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index db1eb41f28e67ee4ed6b276714db989bd25ece2e..8eb5966e5776004a03fee17b74ae72614331a694 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "", "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); + namespace paddle { namespace platform { namespace dynload { +static const char* cupti_lib_path = CUPTI_LIB_PATH; + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) { #endif } +void GetCUPTIDsoHandle(void** dso_handle) { + std::string cupti_path = cupti_lib_path; + if (!FLAGS_cupti_dir.empty()) { + cupti_path = FLAGS_cupti_dir; + } +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false); +#else + GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false); +#endif +} + void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 4ffc335332698d1aba262edf2800965e72de77cb..b5b9c4af916241c1c7361b506f74563ebcf69b9a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle); */ void GetCUDNNDsoHandle(void** dso_handle); +void GetCUPTIDsoHandle(void** dso_handle); + /** * @brief load the DSO of CURAND * diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 4804df7966dfedf7264eebaad3a42ed92739b096..201fc872946b70e3d7fbc318c8b04781056279b9 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,13 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA #include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace platform { @@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } -RecordEvent::RecordEvent(const std::string& name, - const DeviceContext* dev_ctx) { +RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx, + int32_t block_id) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; PushEvent(name_, dev_ctx_); + + full_name_ = string::Sprintf("%s_b%d", name, block_id); + // Maybe need the same push/pop behavior. + SetCurAnnotation(full_name_.c_str()); } RecordEvent::~RecordEvent() { + ClearCurAnnotation(); if (g_state == ProfilerState::kDisabled) return; PopEvent(name_, dev_ctx_); } @@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; - g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; + if (g_state == ProfilerState::kCUDA) { + g_profiler_place = "CUDA"; + } else if (g_state == ProfilerState::kCPU) { + g_profiler_place = "CPU"; + } else { + g_profiler_place = "All"; + GetDeviceTracer()->Enable(); + } #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. 
@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) { Mark("_stop_profiler_", nullptr); g_state = ProfilerState::kDisabled; + DeviceTracer* tracer = GetDeviceTracer(); + if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { + tracer->Disable(); + tracer->GenProfile(); + } + std::vector> all_events = GetAllEvents(); ParseEvents(all_events, sorted_key); ResetProfiler(); @@ -254,9 +278,11 @@ void ParseEvents(std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_profiler_place == "CUDA") - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + double event_time = + (g_profiler_place == "CUDA" || g_profiler_place == "All") + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index a3d22df70057e7967d9fc349ea0cbd73ceb8e0e9..830b86c88ee11b217114c95348c2d25d0dcdf961 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.pb.h" namespace paddle { namespace platform { @@ -93,6 +94,7 @@ enum ProfilerState { kDisabled, // disabled state kCPU, // CPU profiling state kCUDA, // GPU profiling state + kAll, // Profile both CPU and GPU. (Currently experimental). }; void Mark(const std::string& name, const DeviceContext* dev_ctx); @@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + RecordEvent(const std::string& name, const DeviceContext* dev_ctx, + int32_t block_id); ~RecordEvent(); @@ -110,9 +113,12 @@ struct RecordEvent { const DeviceContext* dev_ctx_; // Event name std::string name_; + // Need to distinguish name by op type, block_id, program_id and perhaps + // different kernel invocations within an op. + std::string full_name_; }; -// Return the event list of all threads. Asummed the returned value calls +// Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto new file mode 100644 index 0000000000000000000000000000000000000000..bdd86a0440d2b00eaee14195030456d0ad217f9a --- /dev/null +++ b/paddle/fluid/platform/profiler.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +syntax = "proto2"; +package paddle.platform.proto; + +message Event { + optional string name = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; + optional uint32 device_id = 5; + optional uint32 stream_id = 6; +} + +message Profile { + repeated Event events = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; +} \ No newline at end of file diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index dae4d2206e0a1ec6ef99122460a15c064efe58fd..8bc480857a4c3ae2825f08a8d9ed9c152adb80d4 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) { */ for (int i = 1; i < 5; ++i) { std::string name = "evs_op_" + std::to_string(i); - RecordEvent record_event(name, dev_ctx); + RecordEvent record_event(name, dev_ctx, 0); int counter = 1; while (counter != i * 1000) counter++; } diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index b725be79529c5ccdde12446b5b5c09eaf47550e6..b0a2497d919b65afbe5eeaf4fe47c19baa1aba1c 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -252,7 +252,7 @@ void BindVarDsec(py::module &m) { .value("CHANNEL", proto::VarType::CHANNEL) .value("PLACE_LIST", proto::VarType::PLACE_LIST) .value("READER", proto::VarType::READER) - .value("NCCL_COM", proto::VarType::NCCL_COM); + .value("RAW", proto::VarType::RAW); } void BindOpDesc(py::module &m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b3e03f33470810a685dc7bfe29f8da50454b2238..ac7d1efb577505b70e10a70cdcfd3ed9c5fe1f5c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle. .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) .value("kCUDA", platform::ProfilerState::kCUDA) + .value("kAll", platform::ProfilerState::kAll) .export_values(); py::enum_(m, "EventSortingKey", py::arithmetic()) diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index 65c46745556bc5ea91fdd4e33060f2535422e8e8..78c0cc378231f763597556cc5450f6f03ab2b291 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -58,7 +58,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | -| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | +| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. 
| diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 8ec3d0c657400165c2225238f21facfb6c84df7c..06319fc638984f8f8ed897c362f516e1534bc8db 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -40,7 +40,7 @@ function cmake_gen() { -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} - -DWITH_GOLANG=${WITH_GOLANG:-ON} + -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} -DWITH_SWIG_PY=ON -DWITH_C_API=${WITH_C_API:-OFF} @@ -49,6 +49,7 @@ function cmake_gen() { -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} + -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -64,7 +65,7 @@ EOF -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DWITH_GOLANG=${WITH_GOLANG:-ON} \ + -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ -DWITH_C_API=${WITH_C_API:-OFF} \ @@ -72,6 +73,7 @@ EOF -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ + -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 270f2f4c181df847348be12a199534d47b3577f5..0fea6a80794a64abc2dbf1428d534840febcd450 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,10 +28,9 @@ int main(int argc, char** argv) { } #ifdef PADDLE_WITH_CUDA new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory," - "warpctc_dir")); + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 2fcf3753c5f1211d3b27f38fbdc8d097c437c79a..8da9ca290b22ae69b1fd195d8614c31dc4e13e00 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -226,8 +226,7 @@ class DistributeTranspiler: rpc_client_var = program.global_block().create_var( name="RPC_CLIENT_VAR", persistable=True, - dtype='float32', # dtype and shape is not used in fact - shape=[0]) + type=core.VarDesc.VarType.RAW) # create send_op program.global_block().append_op( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 64441e8fa491dd71101c95e14bedf956eb61ee3e..2e23ddc9be45868218b0c751a226e492ddc5ae39 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -784,6 +784,7 @@ class Block(object): elif type(v) == Variable: var = Variable( self, + type=v.type, name=new_name, error_clip=error_clip, stop_gradient=stop_gradient) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 33f709ece48c450fbf893855edd59cd687cb0d9d..1817caa94275e4efa47ec1a5a0aa861255c75561 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -68,7 +68,7 @@ def save_vars(executor, main_program=None, vars=None, predicate=None, - save_file_name=None): + filename=None): """ Save variables to directory by executor. @@ -80,8 +80,8 @@ def save_vars(executor, as a bool. 
If it returns true, the corresponding input variable will be saved. :param vars: variables need to be saved. If vars is specified, program & predicate will be ignored - :param save_file_name: The name of a single file that all vars are saved to. - If it is None, save variables to separate files. + :param filename: The name of a single file that all vars are saved to. + If it is None, save variables to separate files. :return: None """ @@ -95,7 +95,7 @@ def save_vars(executor, executor, dirname=dirname, vars=filter(predicate, main_program.list_vars()), - save_file_name=save_file_name) + filename=filename) else: save_program = Program() save_block = save_program.global_block() @@ -103,7 +103,7 @@ def save_vars(executor, save_var_map = {} for each_var in vars: new_var = _clone_var_in_block_(save_block, each_var) - if save_file_name is None: + if filename is None: save_block.append_op( type='save', inputs={'X': [new_var]}, @@ -112,7 +112,7 @@ def save_vars(executor, else: save_var_map[new_var.name] = new_var - if save_file_name is not None: + if filename is not None: save_var_list = [] for name in sorted(save_var_map.keys()): save_var_list.append(save_var_map[name]) @@ -121,12 +121,12 @@ def save_vars(executor, type='save_combine', inputs={'X': save_var_list}, outputs={}, - attrs={'file_path': os.path.join(dirname, save_file_name)}) + attrs={'file_path': os.path.join(dirname, filename)}) executor.run(save_program) -def save_params(executor, dirname, main_program=None, save_file_name=None): +def save_params(executor, dirname, main_program=None, filename=None): """ Save all parameters to directory with executor. """ @@ -136,11 +136,10 @@ def save_params(executor, dirname, main_program=None, save_file_name=None): main_program=main_program, vars=None, predicate=is_parameter, - save_file_name=save_file_name) + filename=filename) -def save_persistables(executor, dirname, main_program=None, - save_file_name=None): +def save_persistables(executor, dirname, main_program=None, filename=None): """ Save all persistables to directory with executor. """ @@ -150,7 +149,7 @@ def save_persistables(executor, dirname, main_program=None, main_program=main_program, vars=None, predicate=is_persistable, - save_file_name=save_file_name) + filename=filename) def load_vars(executor, @@ -158,7 +157,7 @@ def load_vars(executor, main_program=None, vars=None, predicate=None, - load_file_name=None): + filename=None): """ Load variables from directory by executor. @@ -170,8 +169,8 @@ def load_vars(executor, as a bool. If it returns true, the corresponding input variable will be loaded. :param vars: variables need to be loaded. If vars is specified, program & predicate will be ignored - :param load_file_name: The name of the single file that all vars are loaded from. - If it is None, load variables from separate files. + :param filename: The name of the single file that all vars are loaded from. + If it is None, load variables from separate files. 
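A usage sketch of the renamed `filename` argument (assumes a program with parameters has already been built and run; the executor setup and paths are illustrative):

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
# filename=None: one file per variable under ./model_dir/ (a 'save' op each).
fluid.io.save_params(exe, dirname="model_dir", filename=None)
# filename given: a single 'save_combine' into ./model_dir/all_params.
fluid.io.save_params(exe, dirname="model_dir", filename="all_params")
# Loading mirrors saving; the same argument selects 'load' vs 'load_combine'.
fluid.io.load_params(exe, dirname="model_dir", filename="all_params")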
:return: None """ @@ -185,7 +184,7 @@ def load_vars(executor, executor, dirname=dirname, vars=filter(predicate, main_program.list_vars()), - load_file_name=load_file_name) + filename=filename) else: load_prog = Program() load_block = load_prog.global_block() @@ -194,7 +193,7 @@ def load_vars(executor, for each_var in vars: assert isinstance(each_var, Variable) new_var = _clone_var_in_block_(load_block, each_var) - if load_file_name is None: + if filename is None: load_block.append_op( type='load', inputs={}, @@ -203,7 +202,7 @@ def load_vars(executor, else: load_var_map[new_var.name] = new_var - if load_file_name is not None: + if filename is not None: load_var_list = [] for name in sorted(load_var_map.keys()): load_var_list.append(load_var_map[name]) @@ -212,12 +211,12 @@ def load_vars(executor, type='load_combine', inputs={}, outputs={"Out": load_var_list}, - attrs={'file_path': os.path.join(dirname, load_file_name)}) + attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) -def load_params(executor, dirname, main_program=None, load_file_name=None): +def load_params(executor, dirname, main_program=None, filename=None): """ load all parameters from directory by executor. """ @@ -226,11 +225,10 @@ def load_params(executor, dirname, main_program=None, load_file_name=None): dirname=dirname, main_program=main_program, predicate=is_parameter, - load_file_name=load_file_name) + filename=filename) -def load_persistables(executor, dirname, main_program=None, - load_file_name=None): +def load_persistables(executor, dirname, main_program=None, filename=None): """ load all persistables from directory by executor. """ @@ -239,7 +237,7 @@ def load_persistables(executor, dirname, main_program=None, dirname=dirname, main_program=main_program, predicate=is_persistable, - load_file_name=load_file_name) + filename=filename) def get_inference_program(target_vars, main_program=None): @@ -299,7 +297,8 @@ def save_inference_model(dirname, target_vars, executor, main_program=None, - save_file_name=None): + model_filename=None, + params_filename=None): """ Build a model especially for inference, and save it to directory by the executor. @@ -310,8 +309,11 @@ def save_inference_model(dirname, :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model. Default default_main_program(). - :param save_file_name: The name of a single file that all parameters are saved to. - If it is None, save parameters to separate files. + :param model_filename: The name of file to save inference program. + If not specified, default filename `__model__` will be used. + :param params_filename: The name of file to save parameters. + It is used for the case that all parameters are saved in a single binary file. + If not specified, parameters are considered saved in separate files. 
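A round-trip sketch with the new model_filename/params_filename pair (assumes `image` is a feed variable and `predict` a fetch target from a network built elsewhere; both names are placeholders):

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
fluid.io.save_inference_model(
    dirname="infer_model",
    feeded_var_names=["image"],
    target_vars=[predict],      # `predict` is a placeholder fetch Variable
    executor=exe,
    model_filename="model",     # None would fall back to "__model__"
    params_filename="params")   # None keeps one file per parameter

program, feed_names, fetch_targets = fluid.io.load_inference_model(
    "infer_model", exe, model_filename="model", params_filename="params")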
:return: None """ @@ -342,15 +344,19 @@ def save_inference_model(dirname, prepend_feed_ops(inference_program, feeded_var_names) append_fetch_ops(inference_program, fetch_var_names) - if save_file_name == None: - model_file_name = dirname + "/__model__" + if model_filename is not None: + model_filename = os.path.basename(model_filename) else: - model_file_name = dirname + "/__model_combined__" + model_filename = "__model__" + model_filename = os.path.join(dirname, model_filename) - with open(model_file_name, "wb") as f: + if params_filename is not None: + params_filename = os.path.basename(params_filename) + + with open(model_filename, "wb") as f: f.write(inference_program.desc.serialize_to_string()) - save_persistables(executor, dirname, inference_program, save_file_name) + save_persistables(executor, dirname, inference_program, params_filename) def get_feed_targets_names(program): @@ -371,15 +377,21 @@ def get_fetch_targets_names(program): return fetch_targets_names -def load_inference_model(dirname, executor, load_file_name=None): +def load_inference_model(dirname, + executor, + model_filename=None, + params_filename=None): """ Load inference model from a directory :param dirname: directory path :param executor: executor that load inference model - :param load_file_name: The name of the single file that all parameters are loaded from. - If it is None, load parameters from separate files. - + :param model_filename: The name of file to load inference program. + If not specified, default filename `__model__` will be used. + :param params_filename: The name of file to load parameters. + It is used for the case that all parameters are saved in a single binary file. + If not specified, parameters are considered saved in separate files. + :return: [program, feed_target_names, fetch_targets] program: program especially for inference. feed_target_names: Names of variables that need to feed data @@ -388,16 +400,20 @@ def load_inference_model(dirname, executor, load_file_name=None): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) - if load_file_name == None: - model_file_name = dirname + "/__model__" + if model_filename is not None: + model_filename = os.path.basename(model_filename) else: - model_file_name = dirname + "/__model_combined__" + model_filename = "__model__" + model_filename = os.path.join(dirname, model_filename) + + if params_filename is not None: + params_filename = os.path.basename(params_filename) - with open(model_file_name, "rb") as f: + with open(model_filename, "rb") as f: program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) - load_persistables(executor, dirname, program, load_file_name) + load_persistables(executor, dirname, program, params_filename) feed_target_names = get_feed_targets_names(program) fetch_target_names = get_fetch_targets_names(program) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 5ae4da1ea31d036217c5595f8b30842403896a7c..fff64a57a43bc3f1ce5806d66e857d033f780620 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -16,6 +16,7 @@ All layers just related to the detection neural network. 
""" from layer_function_generator import generate_layer_fn +from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor import ops @@ -28,6 +29,7 @@ __all__ = [ 'target_assign', 'detection_output', 'ssd_loss', + 'detection_map', ] __auto__ = [ @@ -132,7 +134,48 @@ def detection_output(scores, return nmsed_outs -def bipartite_match(dist_matrix, name=None): +@autodoc() +def detection_map(detect_res, + label, + pos_count=None, + true_pos=None, + false_pos=None, + overlap_threshold=0.3, + evaluate_difficult=True, + ap_type='integral'): + helper = LayerHelper("detection_map", **locals()) + + map_out = helper.create_tmp_variable(dtype='float32') + accum_pos_count_out = helper.create_tmp_variable(dtype='int32') + accum_true_pos_out = helper.create_tmp_variable(dtype='float32') + accum_false_pos_out = helper.create_tmp_variable(dtype='float32') + helper.append_op( + type="detection_map", + inputs={ + 'Label': label, + 'DetectRes': detect_res, + 'PosCount': pos_count, + 'TruePos': true_pos, + 'FalsePos': false_pos + }, + outputs={ + 'MAP': map_out, + 'AccumPosCount': accum_pos_count_out, + 'AccumTruePos': accum_true_pos_out, + 'AccumFalsePos': accum_false_pos_out + }, + attrs={ + 'overlap_threshold': overlap_threshold, + 'evaluate_difficult': evaluate_difficult, + 'ap_type': ap_type + }) + return map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out + + +def bipartite_match(dist_matrix, + match_type=None, + dist_threshold=None, + name=None): """ **Bipartite matchint operator** @@ -164,6 +207,11 @@ def bipartite_match(dist_matrix, name=None): This tensor can contain LoD information to represent a batch of inputs. One instance of this batch can contain different numbers of entities. + match_type(string|None): The type of matching method, should be + 'bipartite' or 'per_prediction', 'bipartite' by defalut. + dist_threshold(float|None): If `match_type` is 'per_prediction', + this threshold is to determine the extra matching bboxes based + on the maximum distance, 0.5 by defalut. Returns: match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. N is the batch size. If match_indices[i][j] is -1, it @@ -183,6 +231,10 @@ def bipartite_match(dist_matrix, name=None): helper.append_op( type='bipartite_match', inputs={'DistMat': dist_matrix}, + attrs={ + 'match_type': match_type, + 'dist_threshold': dist_threshold, + }, outputs={ 'ColToRowMatchIndices': match_indices, 'ColToRowMatchDist': match_distance @@ -333,7 +385,7 @@ def ssd_loss(location, loc_loss_weight (float): Weight for localization loss, 1.0 by default. conf_loss_weight (float): Weight for confidence loss, 1.0 by default. match_type (str): The type of matching method during training, should - be 'bipartite' or 'per_prediction'. + be 'bipartite' or 'per_prediction', 'per_prediction' by defalut. mining_type (str): The hard example mining type, should be 'hard_example' or 'max_negative', now only support `max_negative`. @@ -381,7 +433,8 @@ def ssd_loss(location, # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. iou = iou_similarity(x=gt_box, y=prior_box) # 1.2 Compute matched boundding box by bipartite matching algorithm. - matched_indices, matched_dist = bipartite_match(iou) + matched_indices, matched_dist = bipartite_match(iou, match_type, + overlap_threshold) # 2. Compute confidence for mining hard examples # 2.1. 
Get the target label based on matched indices diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 87cc96eb823a72b8ab65727fda828a864c0bb847..e0b620bb277c20550c49162e96f45dd598b2a7ac 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -21,6 +21,7 @@ from ..framework import Variable from ..param_attr import ParamAttr from layer_function_generator import autodoc from tensor import concat +import utils __all__ = [ 'fc', @@ -1139,8 +1140,8 @@ def sequence_conv(input, def conv2d(input, num_filters, filter_size, - stride=None, - padding=None, + stride=1, + padding=0, groups=None, param_attr=None, bias_attr=None, @@ -1253,12 +1254,10 @@ def conv2d(input, raise ValueError("num_channels must be divisible by groups.") num_filter_channels = num_channels / groups - if isinstance(filter_size, int): - filter_size = [filter_size, filter_size] - if isinstance(stride, int): - stride = [stride, stride] - if isinstance(padding, int): - padding = [padding, padding] + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + stride = utils.convert_to_list(stride, 2, 'stride') + padding = utils.convert_to_list(padding, 2, 'padding') + if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") @@ -1433,10 +1432,10 @@ def sequence_last_step(input): def pool2d(input, - pool_size, - pool_type, - pool_stride=None, - pool_padding=None, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, global_pooling=False, use_cudnn=True, name=None): @@ -1444,20 +1443,20 @@ def pool2d(input, This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. """ - if pool_padding is None: - pool_padding = [0, 0] - if pool_stride is None: - pool_stride = [1, 1] if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", str(pool_type)) - if isinstance(pool_size, int): - pool_size = [pool_size, pool_size] - if isinstance(pool_stride, int): - pool_stride = [pool_stride, pool_stride] - if isinstance(pool_padding, int): - pool_padding = [pool_padding, pool_padding] + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. 
Received pool_size: " + str(pool_size)) + + pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') + pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding') + pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') + if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") @@ -1686,9 +1685,9 @@ def conv2d_transpose(input, num_filters, output_size=None, filter_size=None, - padding=None, - stride=None, - dilation=None, + padding=0, + stride=1, + dilation=1, param_attr=None, use_cudnn=True, name=None): @@ -1784,26 +1783,12 @@ def conv2d_transpose(input, raise TypeError("Input of conv2d_transpose must be Variable") input_channel = input.shape[1] - op_attr = dict() - - if isinstance(padding, int): - op_attr['paddings'] = [padding, padding] - elif padding is not None: - op_attr['paddings'] = padding - - if isinstance(stride, int): - op_attr['strides'] = [stride, stride] - elif stride is not None: - op_attr['strides'] = stride - - if isinstance(dilation, int): - op_attr['dilations'] = [dilation, dilation] - elif dilation is not None: - op_attr['dilations'] = dilation + padding = utils.convert_to_list(padding, 2, 'padding') + stride = utils.convert_to_list(stride, 2, 'stride') + dilation = utils.convert_to_list(dilation, 2, 'dilation') if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - op_attr['use_cudnn'] = use_cudnn if filter_size is None: if output_size is None: @@ -1811,10 +1796,6 @@ def conv2d_transpose(input, if isinstance(output_size, int): output_size = [output_size, output_size] - padding = op_attr.get('paddings', [0, 0]) - stride = op_attr.get('strides', [1, 1]) - dilation = op_attr.get('dilations', [1, 1]) - h_in = input.shape[2] w_in = input.shape[3] @@ -1823,9 +1804,9 @@ def conv2d_transpose(input, filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] - 1) / dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] - - elif isinstance(filter_size, int): - filter_size = [filter_size, filter_size] + else: + filter_size = utils.convert_to_list(filter_size, 2, + 'conv2d_transpose.filter_size') filter_shape = [input_channel, num_filters] + filter_size img_filter = helper.create_parameter( @@ -1837,7 +1818,12 @@ def conv2d_transpose(input, inputs={'Input': [input], 'Filter': [img_filter]}, outputs={'Output': out}, - attrs=op_attr) + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'use_cudnn': use_cudnn + }) return out diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..49ec3088831dff415e042e1b0a632f63106eb07b --- /dev/null +++ b/python/paddle/fluid/layers/utils.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np + + +def convert_to_list(value, n, name, dtype=np.int): + """ + Converts a single numerical type or iterable of numerical + types into a numerical type list. + + Arguments: + value: The value to validate and convert. Could be an int, or any + iterable of ints. + n: The size of the list to be returned. + name: The name of the argument being validated, e.g. "stride" or + "filter_size". This is only used to format error messages. + dtype: the numerical type of the elements of the list to be returned. + + Returns: + A list of n values of type dtype. + + Raises: + ValueError: If something other than an int/long or an iterable thereof + was passed. + """ + if isinstance(value, dtype): + return [value, ] * n + else: + try: + value_list = list(value) + except TypeError: + raise ValueError("The " + name + + "'s type must be list or tuple. Received: " + str( + value)) + if len(value_list) != n: + raise ValueError("The " + name + "'s length must be " + str(n) + + ". Received: " + str(value)) + for single_value in value_list: + try: + dtype(single_value) + except (ValueError, TypeError): + raise ValueError( + "The " + name + "'s type must be a list or tuple of " + str( + n) + " " + str(dtype) + " . Received: " + str( + value) + " " + "including element " + str(single_value) + " of type" + " " + + str(type(single_value))) + return value_list diff --git a/python/paddle/fluid/learning_rate_decay.py b/python/paddle/fluid/learning_rate_decay.py index a914b94e234757f5bac03d8cad4c2fc69136b92a..631efa048740ea3d50947a321ae2e76c6a6048af 100644 --- a/python/paddle/fluid/learning_rate_decay.py +++ b/python/paddle/fluid/learning_rate_decay.py @@ -13,7 +13,6 @@ # limitations under the License. import layers -from framework import Variable from initializer import init_on_cpu __all__ = [ diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 4611986c9969f12b71290cf8ee03a50a6ad76f94..59e75209d39dc0f2b72ecf832ff15df192a2898e 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -97,9 +97,14 @@ def profiler(state, sorted_key=None): The `ave` means sorting by the average execution time.
""" - if state not in ['CPU', 'GPU']: - raise ValueError("The state must be 'CPU' or 'GPU'.") - prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU + if state not in ['CPU', 'GPU', "All"]: + raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") + if state == "GPU": + prof_state = core.ProfilerState.kCUDA + elif state == "CPU": + prof_state = core.ProfilerState.kCPU + else: + prof_state = core.ProfilerState.kAll core.enable_profiler(prof_state) yield diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 5ff7b1b027e0e17d233f2a8a1c9775ccfbe1797e..d24417bbacb503d9ea70e68e7e0edb59e7dddbde 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -7,5 +7,4 @@ endforeach() add_subdirectory(unittests) add_subdirectory(book) -add_subdirectory(book_distribute) add_subdirectory(book_memory_optimization) diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py index 1ed58c3d3d170a938cef813692d7841227964b16..983f8f4dbeac83566839de25ec9765eb248be768 100644 --- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py +++ b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py @@ -228,32 +228,34 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - word_data = create_random_lodtensor(lod, place, low=0, high=1) - trg_word = create_random_lodtensor(lod, place, low=0, high=1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == 'source_sequence' - assert feed_target_names[1] == 'target_sequence' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word_data, - feed_target_names[1]: trg_word, - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference shape: ", np_data.shape) - print("Inference results: ", np_data) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word_data = create_random_lodtensor(lod, place, low=0, high=1) + trg_word = create_random_lodtensor(lod, place, low=0, high=1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ assert feed_target_names[0] == 'source_sequence' + assert feed_target_names[1] == 'target_sequence' + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word_data, + feed_target_names[1]: trg_word, + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference shape: ", np_data.shape) + print("Inference results: ", np_data) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 8ceee52ff9c425a6cc5479acb9c5b8f0928fc991..93ef66851b0efd65361122853dadeefe11992ed5 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -19,9 +19,10 @@ import numpy import unittest import math import sys +import os -def train(use_cuda, save_dirname): +def train(use_cuda, save_dirname, is_local): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -32,7 +33,7 @@ def train(use_cuda, save_dirname): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 @@ -42,27 +43,57 @@ def train(use_cuda, save_dirname): batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print(avg_loss_value) - if avg_loss_value[0] < 10.0: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ['x'], - [y_predict], exe) - return - if math.isnan(float(avg_loss_value)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Fit a line cost is too large, {0:2.2}".format( - avg_loss_value[0])) + def train_loop(main_program): + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_loss_value, = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print(avg_loss_value) + if avg_loss_value[0] < 10.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ['x'], + [y_predict], exe) + return + if math.isnan(float(avg_loss_value)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Fit a line cost is too large, {0:2.2}".format( + avg_loss_value[0])) + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -72,33 +103,36 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - # The input's dimension should be 2-D and the second dim is 13 - # The input data should be >= 0 - batch_size = 10 - tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") - assert feed_target_names[0] == 'x' - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_x}, - fetch_list=fetch_targets) - print("infer shape: ", results[0].shape) - print("infer results: ", results[0]) - - -def main(use_cuda): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension should be 2-D and the second dim is 13 + # The input data should be >= 0 + batch_size = 10 + tensor_x = numpy.random.uniform(0, 10, + [batch_size, 13]).astype("float32") + assert feed_target_names[0] == 'x' + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_x}, + fetch_list=fetch_targets) + print("infer shape: ", results[0].shape) + print("infer results: ", results[0]) + + +def main(use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model save_dirname = "fit_a_line.inference.model" - train(use_cuda, save_dirname) + train(use_cuda, save_dirname, is_local) infer(use_cuda, save_dirname) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 615e23529a9ac613d5e37ae68175cc09ad73b43f..613f4a7bf1c41f9f320ba8d310545a182f95e316 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -21,6 +21,7 @@ import math import sys import numpy import unittest +import os def resnet_cifar10(input, depth=32): @@ -92,7 +93,7 @@ def vgg16_bn_drop(input): return fc2 -def train(net_type, use_cuda, save_dirname): +def train(net_type, use_cuda, save_dirname, is_local): classdim = 10 data_shape = [3, 32, 32] @@ -117,7 +118,7 @@ def train(net_type, use_cuda, save_dirname): test_program = fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_cost) + optimize_ops, params_grads = optimizer.minimize(avg_cost) BATCH_SIZE = 128 PASS_NUM = 1 @@ -133,38 +134,68 @@ def train(net_type, use_cuda, save_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) - exe.run(fluid.default_startup_program()) - - loss = 0.0 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - exe.run(feed=feeder.feed(data)) - - if (batch_id % 10) == 0: - acc_list = [] - avg_loss_list = [] - for tid, test_data in enumerate(test_reader()): - loss_t, acc_t = exe.run(program=test_program, - feed=feeder.feed(test_data), - fetch_list=[avg_cost, acc]) - if math.isnan(float(loss_t)): - sys.exit("got NaN loss, training failed.") - acc_list.append(float(acc_t)) - avg_loss_list.append(float(loss_t)) - break # Use 1 segment for speeding up CI - - acc_value = numpy.array(acc_list).mean() - avg_loss_value = numpy.array(avg_loss_list).mean() - - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. 
- format(pass_id, batch_id + 1, - float(avg_loss_value), float(acc_value))) - - if acc_value > 0.01: # Low threshold for speeding up CI - fluid.io.save_inference_model(save_dirname, ["pixel"], - [predict], exe) - return + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + loss = 0.0 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + exe.run(main_program, feed=feeder.feed(data)) + + if (batch_id % 10) == 0: + acc_list = [] + avg_loss_list = [] + for tid, test_data in enumerate(test_reader()): + loss_t, acc_t = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") + acc_list.append(float(acc_t)) + avg_loss_list.append(float(loss_t)) + break # Use 1 segment for speeding up CI + + acc_value = numpy.array(acc_list).mean() + avg_loss_value = numpy.array(avg_loss_list).mean() + + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_value), float(acc_value))) + + if acc_value > 0.01: # Low threshold for speeding up CI + fluid.io.save_inference_model(save_dirname, ["pixel"], + [predict], exe) + return + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -174,32 +205,36 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - # The input's dimension of conv should be 4-D or 5-D. - tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32") - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. 
- results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - print("infer results: ", results[0]) - - -def main(net_type, use_cuda): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension of conv should be 4-D or 5-D. + # Use normilized image pixels as input data, which should be in the range [0, 1.0]. + batch_size = 1 + tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32") + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("infer results: ", results[0]) + + +def main(net_type, use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model save_dirname = "image_classification_" + net_type + ".inference.model" - train(net_type, use_cuda, save_dirname) + train(net_type, use_cuda, save_dirname, is_local) infer(use_cuda, save_dirname) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 05cf0ba6c6c8e76cd91427d762478d0820cd8259..5c6374b93175d85c49633b73b20aa5e3b64ff9f1 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -22,11 +22,12 @@ from paddle.fluid.initializer import init_on_cpu import contextlib import time import unittest +import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) label_dict_len = len(label_dict) -pred_len = len(verb_dict) +pred_dict_len = len(verb_dict) mark_dict_len = 2 word_dim = 32 @@ -53,7 +54,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, - size=[pred_len, word_dim], + size=[pred_dict_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr='vemb') @@ -138,7 +139,7 @@ def create_random_lodtensor(lod, place, low, high): return res -def train(use_cuda, save_dirname=None): +def train(use_cuda, save_dirname=None, is_local=True): # define network topology word = fluid.layers.data( name='word_data', shape=[1], dtype='int64', lod_level=1) @@ -174,7 +175,7 @@ def train(use_cuda, save_dirname=None): decay_steps=100000, decay_rate=0.5, staircase=True)) - sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) # TODO(qiao) # add dependency track and move this config before optimizer @@ -200,44 +201,78 @@ def train(use_cuda, save_dirname=None): place=place) exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() - embedding_param.set( - load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) - - start_time = time.time() - batch_id = 0 - for pass_id in xrange(PASS_NUM): - chunk_evaluator.reset(exe) - for data in train_data(): 
- cost, precision, recall, f1_score = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + chunk_evaluator.metrics) - pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( - exe) - - if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + str( - f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) - if batch_id != 0: - print("second per batch: " + str((time.time() - start_time) - / batch_id)) - # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) - return - - batch_id = batch_id + 1 + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + + start_time = time.time() + batch_id = 0 + for pass_id in xrange(PASS_NUM): + chunk_evaluator.reset(exe) + for data in train_data(): + cost, precision, recall, f1_score = exe.run( + main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( + exe) + + if batch_id % 10 == 0: + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + # Set the threshold low to speed up the CI test + if float(pass_precision) > 0.05: + if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) + return + + batch_id = batch_id + 1 + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -247,61 +282,70 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - ts_word = create_random_lodtensor(lod, place, low=0, high=1) - ts_pred = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_n2 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_n1 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_0 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_p1 = create_random_lodtensor(lod, place, low=0, high=1) - ts_ctx_p2 = create_random_lodtensor(lod, place, low=0, high=1) - ts_mark = create_random_lodtensor(lod, place, low=0, high=1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == 'word_data' - assert feed_target_names[1] == 'verb_data' - assert feed_target_names[2] == 'ctx_n2_data' - assert feed_target_names[3] == 'ctx_n1_data' - assert feed_target_names[4] == 'ctx_0_data' - assert feed_target_names[5] == 'ctx_p1_data' - assert feed_target_names[6] == 'ctx_p2_data' - assert feed_target_names[7] == 'mark_data' - - results = exe.run(inference_program, - feed={ - feed_target_names[0]: ts_word, - feed_target_names[1]: ts_pred, - feed_target_names[2]: ts_ctx_n2, - feed_target_names[3]: ts_ctx_n1, - feed_target_names[4]: ts_ctx_0, - feed_target_names[5]: ts_ctx_p1, - feed_target_names[6]: ts_ctx_p2, - feed_target_names[7]: ts_mark - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) - - -def main(use_cuda): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). 
+ [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + lod = [0, 4, 10] + word = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + pred = create_random_lodtensor( + lod, place, low=0, high=pred_dict_len - 1) + ctx_n2 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_n1 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_0 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_p1 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + ctx_p2 = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + mark = create_random_lodtensor( + lod, place, low=0, high=mark_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + assert feed_target_names[0] == 'word_data' + assert feed_target_names[1] == 'verb_data' + assert feed_target_names[2] == 'ctx_n2_data' + assert feed_target_names[3] == 'ctx_n1_data' + assert feed_target_names[4] == 'ctx_0_data' + assert feed_target_names[5] == 'ctx_p1_data' + assert feed_target_names[6] == 'ctx_p2_data' + assert feed_target_names[7] == 'mark_data' + + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + + +def main(use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return # Directory for saving the trained model save_dirname = "label_semantic_roles.inference.model" - train(use_cuda, save_dirname) + train(use_cuda, save_dirname, is_local) infer(use_cuda, save_dirname) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index bd768d5f08de57005f76ea3ea25b318a930a58c7..caa9596a100de4f9364467690db1e80ee227c3c1 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -20,6 +20,7 @@ import paddle.fluid.framework as framework import paddle.fluid.layers as pd from paddle.fluid.executor import Executor import unittest +import os dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -168,7 +169,7 @@ def to_lodtensor(data, place): return res -def train_main(use_cuda, is_sparse): +def train_main(use_cuda, is_sparse, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -181,7 +182,7 @@ def train_main(use_cuda, is_sparse): avg_cost = pd.mean(cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) - optimizer.minimize(avg_cost) + optimize_ops, params_grads = optimizer.minimize(avg_cost) train_data = paddle.batch( paddle.reader.shuffle( @@ -190,27 +191,57 @@ def train_main(use_cuda, is_sparse): exe = Executor(place) - exe.run(framework.default_startup_program()) - - batch_id = 0 - for pass_id in xrange(1): - for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - trg_word = to_lodtensor(map(lambda x: x[1], data), place) - trg_word_next = 
to_lodtensor(map(lambda x: x[2], data), place) - outs = exe.run(framework.default_main_program(), - feed={ - 'src_word_id': word_data, - 'target_language_word': trg_word, - 'target_language_next_word': trg_word_next - }, - fetch_list=[avg_cost]) - avg_cost_val = np.array(outs[0]) - print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + - " avg_cost=" + str(avg_cost_val)) - if batch_id > 3: - break - batch_id += 1 + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(1): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(main_program, + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + break + batch_id += 1 + + if is_local: + train_loop(framework.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def decode_main(use_cuda, is_sparse): diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 12307111d5dda549bff7ea40ac7c341c69c3e4bd..b57fe08e1a367c33db31c89127b6c2bc08253655 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -20,27 +20,7 @@ import numpy import unittest import math import sys - - -def parse_arg(): - parser = argparse.ArgumentParser() - parser.add_argument( - "nn_type", - help="The neural network type, in ['mlp', 'conv']", - type=str, - choices=['mlp', 'conv']) - parser.add_argument( - "--parallel", - help='Run in parallel or not', - default=False, - action="store_true") - parser.add_argument( - "--use_cuda", - help="Run the program by using CUDA", - default=False, - action="store_true") - return parser.parse_args() - +import os BATCH_SIZE = 64 @@ -78,7 +58,13 @@ def conv_net(img, label): return loss_net(conv_pool_2, label) -def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): +def train(nn_type, + use_cuda, + parallel, + save_dirname=None, + model_filename=None, + params_filename=None, + is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): return img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') @@ -109,12 +95,11 @@ def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): test_program 
= fluid.default_main_program().clone() optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(avg_loss) + optimize_ops, params_grads = optimizer.minimize(avg_loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) train_reader = paddle.batch( paddle.reader.shuffle( @@ -124,88 +109,133 @@ def train(nn_type, use_cuda, parallel, save_dirname, save_param_filename): paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) feeder = fluid.DataFeeder(feed_list=[img, label], place=place) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - # train a mini-batch, fetch nothing - exe.run(feed=feeder.feed(data)) - if (batch_id + 1) % 10 == 0: - acc_set = [] - avg_loss_set = [] - for test_data in test_reader(): - acc_np, avg_loss_np = exe.run(program=test_program, - feed=feeder.feed(test_data), - fetch_list=[acc, avg_loss]) - acc_set.append(float(acc_np)) - avg_loss_set.append(float(avg_loss_np)) - # get test acc and loss - acc_val = numpy.array(acc_set).mean() - avg_loss_val = numpy.array(avg_loss_set).mean() - if float(acc_val) > 0.2: # Smaller value to increase CI speed - if save_dirname is not None: - fluid.io.save_inference_model( - save_dirname, ["img"], [prediction], - exe, - save_file_name=save_param_filename) - return - else: - print( - 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. - format(pass_id, batch_id + 1, - float(avg_loss_val), float(acc_val))) - if math.isnan(float(avg_loss_val)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Loss of recognize digits is too large") - - -def infer(use_cuda, save_dirname=None, param_filename=None): + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + # train a mini-batch, fetch nothing + exe.run(main_program, feed=feeder.feed(data)) + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run( + program=test_program, + feed=feeder.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = numpy.array(acc_set).mean() + avg_loss_val = numpy.array(avg_loss_set).mean() + if float(acc_val + ) > 0.2: # Smaller value to increase CI speed + if save_dirname is not None: + fluid.io.save_inference_model( + save_dirname, ["img"], [prediction], + exe, + model_filename=model_filename, + params_filename=params_filename) + return + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Loss of recognize digits is too large") + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(use_cuda, + save_dirname=None, + model_filename=None, + params_filename=None): if save_dirname is None: return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, fetch_targets - ] = fluid.io.load_inference_model(save_dirname, exe, param_filename) - - # The input's dimension of conv should be 4-D or 5-D. - # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0]. - batch_size = 1 - tensor_img = numpy.random.uniform(-1.0, 1.0, - [batch_size, 1, 28, 28]).astype("float32") - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - print("infer results: ", results[0]) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + save_dirname, exe, model_filename, params_filename) + + # The input's dimension of conv should be 4-D or 5-D. + # Use normalized image pixels as input data, which should be in the range [-1.0, 1.0]. + batch_size = 1 + tensor_img = numpy.random.uniform( + -1.0, 1.0, [batch_size, 1, 28, 28]).astype("float32") + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets.
+ results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("infer results: ", results[0]) def main(use_cuda, parallel, nn_type, combine): + save_dirname = None + model_filename = None + params_filename = None if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" - save_filename = None if combine == True: - save_filename = "__params_combined__" - else: - save_dirname = None - save_filename = None + model_filename = "__model_combined__" + params_filename = "__params_combined__" + # call train() with is_local argument to run distributed train train( nn_type=nn_type, use_cuda=use_cuda, parallel=parallel, save_dirname=save_dirname, - save_param_filename=save_filename) + model_filename=model_filename, + params_filename=params_filename) infer( use_cuda=use_cuda, save_dirname=save_dirname, - param_filename=save_filename) + model_filename=model_filename, + params_filename=params_filename) class TestRecognizeDigits(unittest.TestCase): diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index c190107e02e044635ff0b47c61de41c8bfed5acc..5e258a2c5170f63aa1fbaab5f38efdba04c8d391 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -14,6 +14,7 @@ import math import sys +import os import numpy as np import paddle.v2 as paddle import paddle.fluid as fluid @@ -152,19 +153,18 @@ def model(): return scale_infer, avg_cost -def train(use_cuda, save_dirname): +def train(use_cuda, save_dirname, is_local=True): scale_infer, avg_cost = model() # test program test_program = fluid.default_main_program().clone() sgd_optimizer = SGDOptimizer(learning_rate=0.2) - opts = sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = Executor(place) - exe.run(framework.default_startup_program()) train_reader = paddle.batch( paddle.reader.shuffle( @@ -212,36 +212,69 @@ def train(use_cuda, save_dirname): feed_tensors[key] = tensor return feed_tensors - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for batch_id, data in enumerate(train_reader()): - # train a mini-batch - outs = exe.run(program=fluid.default_main_program(), - feed=func_feed(feeding, data), - fetch_list=[avg_cost]) - out = np.array(outs[0]) - if (batch_id + 1) % 10 == 0: - avg_cost_set = [] - for test_data in test_reader(): - avg_cost_np = exe.run(program=test_program, - feed=func_feed(feeding, test_data), - fetch_list=[avg_cost]) - avg_cost_set.append(avg_cost_np[0]) - break # test only 1 segment for speeding up CI - - # get test avg_cost - test_avg_cost = np.array(avg_cost_set).mean() - if test_avg_cost < 6.0: - # if avg_cost less than 6.0, we think our code is good. 
- if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - "user_id", "gender_id", "age_id", "job_id", - "movie_id", "category_id", "movie_title" - ], [scale_infer], exe) - return - - if math.isnan(float(out[0])): - sys.exit("got NaN loss, training failed.") + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + # train a mini-batch + outs = exe.run(program=main_program, + feed=func_feed(feeding, data), + fetch_list=[avg_cost]) + out = np.array(outs[0]) + if (batch_id + 1) % 10 == 0: + avg_cost_set = [] + for test_data in test_reader(): + avg_cost_np = exe.run( + program=test_program, + feed=func_feed(feeding, test_data), + fetch_list=[avg_cost]) + avg_cost_set.append(avg_cost_np[0]) + break # test only 1 segment for speeding up CI + + # get test avg_cost + test_avg_cost = np.array(avg_cost_set).mean() + if test_avg_cost < 6.0: + # if avg_cost less than 6.0, we think our code is good. + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + "user_id", "gender_id", "age_id", "job_id", + "movie_id", "category_id", "movie_title" + ], [scale_infer], exe) + return + + if math.isnan(float(out[0])): + sys.exit("got NaN loss, training failed.") + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) def infer(use_cuda, save_dirname=None): @@ -251,13 +284,6 @@ def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). 
- [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - def create_lod_tensor(data, lod=None): tensor = fluid.LoDTensor() if lod is None: @@ -275,44 +301,53 @@ def infer(use_cuda, save_dirname=None): tensor.set(flattened_data, place) return tensor - # Use the first data from paddle.dataset.movielens.test() as input - assert feed_target_names[0] == "user_id" - user_id = create_lod_tensor([[1]]) - - assert feed_target_names[1] == "gender_id" - gender_id = create_lod_tensor([[1]]) - - assert feed_target_names[2] == "age_id" - age_id = create_lod_tensor([[0]]) - - assert feed_target_names[3] == "job_id" - job_id = create_lod_tensor([[10]]) - - assert feed_target_names[4] == "movie_id" - movie_id = create_lod_tensor([[783]]) - - assert feed_target_names[5] == "category_id" - category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) - - assert feed_target_names[6] == "movie_title" - movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], - [[0, 5]]) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={ - feed_target_names[0]: user_id, - feed_target_names[1]: gender_id, - feed_target_names[2]: age_id, - feed_target_names[3]: job_id, - feed_target_names[4]: movie_id, - feed_target_names[5]: category_id, - feed_target_names[6]: movie_title - }, - fetch_list=fetch_targets, - return_numpy=False) - print("inferred score: ", np.array(results[0])) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # Use the first data from paddle.dataset.movielens.test() as input + assert feed_target_names[0] == "user_id" + user_id = create_lod_tensor([[1]]) + + assert feed_target_names[1] == "gender_id" + gender_id = create_lod_tensor([[1]]) + + assert feed_target_names[2] == "age_id" + age_id = create_lod_tensor([[0]]) + + assert feed_target_names[3] == "job_id" + job_id = create_lod_tensor([[10]]) + + assert feed_target_names[4] == "movie_id" + movie_id = create_lod_tensor([[783]]) + + assert feed_target_names[5] == "category_id" + category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]]) + + assert feed_target_names[6] == "movie_title" + movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]], + [[0, 5]]) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ results = exe.run(inference_program, + feed={ + feed_target_names[0]: user_id, + feed_target_names[1]: gender_id, + feed_target_names[2]: age_id, + feed_target_names[3]: job_id, + feed_target_names[4]: movie_id, + feed_target_names[5]: category_id, + feed_target_names[6]: movie_title + }, + fetch_list=fetch_targets, + return_numpy=False) + print("inferred score: ", np.array(results[0])) def main(use_cuda): diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py index ab8df93651c01f75eeda1eab1ac95db867678106..1b7e84ea05cab5750865032ee7440cd5f5aa519b 100644 --- a/python/paddle/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ import contextlib import math import numpy as np import sys +import os def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, @@ -132,7 +133,12 @@ def create_random_lodtensor(lod, place, low, high): return res -def train(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): +def train(word_dict, + net_method, + use_cuda, + parallel=False, + save_dirname=None, + is_local=True): BATCH_SIZE = 128 PASS_NUM = 5 dict_dim = len(word_dict) @@ -164,7 +170,7 @@ def train(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): assert save_dirname is None adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) - adagrad.minimize(cost) + optimize_ops, params_grads = adagrad.minimize(cost) train_data = paddle.batch( paddle.reader.shuffle( @@ -174,55 +180,88 @@ def train(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - exe.run(fluid.default_startup_program()) - - for pass_id in xrange(PASS_NUM): - for data in train_data(): - cost_val, acc_val = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - print("cost=" + str(cost_val) + " acc=" + str(acc_val)) - if cost_val < 0.4 and acc_val > 0.8: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, ["words"], - prediction, exe) - return - if math.isnan(float(cost_val)): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Cost is too large for {0}".format( - net_method.__name__)) - - -def infer(use_cuda, save_dirname=None): + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + for pass_id in xrange(PASS_NUM): + for data in train_data(): + cost_val, acc_val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) + print("cost=" + str(cost_val) + " acc=" + str(acc_val)) + if cost_val < 0.4 and acc_val > 0.8: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, ["words"], + prediction, exe) + return + if math.isnan(float(cost_val)): + sys.exit("got NaN loss, training failed.") + raise AssertionError("Cost is too large for {0}".format( + net_method.__name__)) + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... 
+ eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(word_dict, use_cuda, save_dirname=None): if save_dirname is None: return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - lod = [0, 4, 10] - word_dict = paddle.dataset.imdb.word_dict() - tensor_words = create_random_lodtensor( - lod, place, low=0, high=len(word_dict) - 1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == "words" - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_words}, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict_len = len(word_dict) + + lod = [0, 4, 10] + tensor_words = create_random_lodtensor( + lod, place, low=0, high=word_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ assert feed_target_names[0] == "words" + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_words}, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + print("Inference results: ", np_data) def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None): @@ -258,7 +297,7 @@ class TestUnderstandSentiment(unittest.TestCase): self.word_dict, net_method=convolution_net, use_cuda=False, - save_dirname="understand_sentiment.inference.model") + save_dirname="understand_sentiment_conv.inference.model") def test_conv_cpu_parallel(self): with self.new_program_scope(): @@ -271,7 +310,11 @@ class TestUnderstandSentiment(unittest.TestCase): @unittest.skip(reason="make CI faster") def test_stacked_lstm_cpu(self): with self.new_program_scope(): - main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False) + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=False, + save_dirname="understand_sentiment_stacked_lstm.inference.model") def test_stacked_lstm_cpu_parallel(self): with self.new_program_scope(): @@ -287,7 +330,7 @@ class TestUnderstandSentiment(unittest.TestCase): self.word_dict, net_method=convolution_net, use_cuda=True, - save_dirname="understand_sentiment.inference.model") + save_dirname="understand_sentiment_conv.inference.model") def test_conv_gpu_parallel(self): with self.new_program_scope(): @@ -300,7 +343,11 @@ class TestUnderstandSentiment(unittest.TestCase): @unittest.skip(reason="make CI faster") def test_stacked_lstm_gpu(self): with self.new_program_scope(): - main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True) + main( + self.word_dict, + net_method=stacked_lstm_net, + use_cuda=True, + save_dirname="understand_sentiment_stacked_lstm.inference.model") def test_stacked_lstm_gpu_parallel(self): with self.new_program_scope(): diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index f33a759240f21f52817c482a2ebe008155dbd97b..26b97c3e254f54b83515436660e44d4908c98fbe 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -1,5 +1,6 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# # Licensed under the Apache License, Version 2.0 (the "License"); +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -21,6 +22,7 @@ import sys def create_random_lodtensor(lod, place, low, high): + # The range of data elements is [low, high] data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") res = fluid.LoDTensor() res.set(data, place) @@ -28,54 +30,7 @@ return res - -def infer(use_cuda, save_dirname=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be feeded - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators).
- [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - 1 - - # Setup input, by creating 4 words, and setting up lod required for - # lookup_table_op - lod = [0, 1] - first_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - second_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - third_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size) - - assert feed_target_names[0] == 'firstw' - assert feed_target_names[1] == 'secondw' - assert feed_target_names[2] == 'thirdw' - assert feed_target_names[3] == 'forthw' - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - results = exe.run(inference_program, - feed={ - feed_target_names[0]: first_word, - feed_target_names[1]: second_word, - feed_target_names[2]: third_word, - feed_target_names[3]: fourth_word - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) - print("Inference results: ", np_data) - - -def train(use_cuda, is_sparse, parallel, save_dirname): +def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -130,7 +85,7 @@ def train(use_cuda, is_sparse, parallel, save_dirname): forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - if not parallel: + if not is_parallel: avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: @@ -146,7 +101,7 @@ def train(use_cuda, is_sparse, parallel, save_dirname): avg_cost = fluid.layers.mean(pd()) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) @@ -157,30 +112,116 @@ def train(use_cuda, is_sparse, parallel, save_dirname): feed_list=[first_word, second_word, third_word, forth_word, next_word], place=place) - exe.run(fluid.default_startup_program()) + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_cost_np = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + if avg_cost_np[0] < 5.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'firstw', 'secondw', 'thirdw', 'forthw' + ], [predict_word], exe) + return + if math.isnan(float(avg_cost_np[0])): + sys.exit("got NaN loss, training failed.") + + raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) + + if is_local: + train_loop(fluid.default_main_program()) + else: + port = os.getenv("PADDLE_INIT_PORT", "6174") + pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
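+ # TRAINERS, POD_IP, PADDLE_INIT_TRAINER_ID and TRAINING_ROLE are assumed to be injected by the cluster launcher (for example paddle_k8s).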
+ trainers = int(os.getenv("TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver_endpoints, + trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_cost_np = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - if avg_cost_np[0] < 5.0: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - 'firstw', 'secondw', 'thirdw', 'forthw' - ], [predict_word], exe) - return - if math.isnan(float(avg_cost_np[0])): - sys.exit("got NaN loss, training failed.") - raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0])) +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) -def main(use_cuda, is_sparse, parallel): + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + # Set up inputs by creating 4 words whose lod is [0, 1] + lod = [0, 1] + first_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + second_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + third_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + fourth_word = create_random_lodtensor( + lod, place, low=0, high=dict_size - 1) + + assert feed_target_names[0] == 'firstw' + assert feed_target_names[1] == 'secondw' + assert feed_target_names[2] == 'thirdw' + assert feed_target_names[3] == 'forthw' + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets.
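+ # With return_numpy=False, exe.run returns LoDTensors, so the LoD can be printed before converting the result to a numpy array.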
+ results = exe.run(inference_program, + feed={ + feed_target_names[0]: first_word, + feed_target_names[1]: second_word, + feed_target_names[2]: third_word, + feed_target_names[3]: fourth_word + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) + + +def main(use_cuda, is_sparse, is_parallel): if use_cuda and not fluid.core.is_compiled_with_cuda(): return - save_dirname = "word2vec.inference.model" - train(use_cuda, is_sparse, parallel, save_dirname) + + if not is_parallel: + save_dirname = "word2vec.inference.model" + else: + save_dirname = None + + train(use_cuda, is_sparse, is_parallel, save_dirname) infer(use_cuda, save_dirname) @@ -193,10 +234,10 @@ class W2VTest(unittest.TestCase): pass -def inject_test_method(use_cuda, is_sparse, parallel): +def inject_test_method(use_cuda, is_sparse, is_parallel): fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse" if is_sparse else "dense", "parallel" - if parallel else "normal") + if is_parallel else "normal") def __impl__(*args, **kwargs): prog = fluid.Program() @@ -204,10 +245,12 @@ def inject_test_method(use_cuda, is_sparse, parallel): scope = fluid.core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel) + main( + use_cuda=use_cuda, + is_sparse=is_sparse, + is_parallel=is_parallel) - # run only 2 cases: use_cuda is either True or False - if is_sparse == False and parallel == False: + if use_cuda and is_sparse: fn = __impl__ else: # skip the other test when on CI server @@ -219,8 +262,8 @@ def inject_test_method(use_cuda, is_sparse, parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for parallel in (False, True): - inject_test_method(use_cuda, is_sparse, parallel) + for is_parallel in (False, True): + inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/book_distribute/CMakeLists.txt b/python/paddle/fluid/tests/book_distribute/CMakeLists.txt deleted file mode 100644 index 4d7664469e481344cf9eea84688f068b4fb99dee..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py deleted file mode 100644 index 01c1fa24fd4ae449af1f66f40a1c641f718c0e58..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import os - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') - -y_predict = fluid.layers.fc(input=x, size=1, act=None) - -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(cost) - -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - -BATCH_SIZE = 20 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() -# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -else: - trainer_prog = t.get_trainer_program() - - exe.run(fluid.default_startup_program()) - - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") - for data in train_reader(): - avg_loss_value = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print("loss:" + str(avg_loss_value)) - if avg_loss_value[0] < 10.0: - exit(0) -exit(1) diff --git a/python/paddle/fluid/tests/book_distribute/notest_dist_image_classification.py b/python/paddle/fluid/tests/book_distribute/notest_dist_image_classification.py deleted file mode 100644 index e9101fd763c4fd466e1e9287a24487516dc250f6..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_dist_image_classification.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import paddle.v2 as paddle -import paddle.fluid as fluid -import os -import sys - -TRAINERS = 5 -BATCH_SIZE = 128 -PASS_NUM = 100 - - -def resnet_cifar10(input, depth=32): - def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - tmp = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=tmp, act=act) - - def shortcut(input, ch_in, ch_out, stride): - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - def basicblock(input, ch_in, ch_out, stride): - tmp = conv_bn_layer(input, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) - short = shortcut(input, ch_in, ch_out, stride) - return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') - - def layer_warp(block_func, input, ch_in, ch_out, count, stride): - tmp = block_func(input, ch_in, ch_out, stride) - for i in range(1, count): - tmp = block_func(tmp, ch_out, ch_out, 1) - return tmp - - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) - res2 = layer_warp(basicblock, res1, 16, 32, n, 2) - res3 = layer_warp(basicblock, res2, 32, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - return pool - - -def vgg16_bn_drop(input): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - return fc2 - - -classdim = 10 -data_shape = [3, 32, 32] - -images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') - -net_type = "vgg" -if len(sys.argv) >= 2: - net_type = sys.argv[1] - -if net_type == "vgg": - print("training vgg net") - net = vgg16_bn_drop(images) -elif net_type == "resnet": - print("training resnet") - net = resnet_cifar10(images, 32) -else: - raise ValueError("%s network is not supported" % net_type) - -predict = fluid.layers.fc(input=net, size=classdim, act='softmax') -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(cost) - -optimizer = fluid.optimizer.Adam(learning_rate=0.001) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() 
-# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS) - -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - accuracy.reset(exe) - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - print("pass_id:" + str(pass_id) + "loss:" + str(loss) + " pass_acc:" - + str(pass_acc)) - # this model is slow, so if we can train two mini batches, - # we think it works properly. - print("trainer run end") -else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") -exit(1) diff --git a/python/paddle/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py b/python/paddle/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py deleted file mode 100644 index 2d0c54fa7c37f42241702769d01c546fcb6efd8e..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import numpy as np -import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 -import paddle.fluid as fluid -import time -import os - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_len = len(verb_dict) - -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 -mix_hidden_lr = 1e-3 - -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 20 - -embedding_name = 'emb' - - -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. 
- return np.fromfile(f, dtype=np.float32).reshape(h, w) - - -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - **ignored): - # 8 features - predicate_embedding = fluid.layers.embedding( - input=predicate, - size=[pred_len, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='vemb') - - mark_embedding = fluid.layers.embedding( - input=mark, - size=[mark_dict_len, mark_dim], - dtype='float32', - is_sparse=IS_SPARSE) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - fluid.layers.embedding( - size=[word_dict_len, word_dim], - input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers - ] - - hidden_0 = fluid.layers.sums(input=hidden_0_layers) - - lstm_0 = fluid.layers.dynamic_lstm( - input=hidden_0, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid') - - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) - ]) - - lstm = fluid.layers.dynamic_lstm( - input=mix_hidden, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=((i % 2) == 1)) - - input_tmp = [mix_hidden, lstm] - - feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) - ]) - - return feature_out - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - # define network topology - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) - avg_cost = fluid.layers.mean(crf_cost) - - # TODO(qiao) - # check other optimizers and check why out will be NAN - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - - # TODO(qiao) - # add dependency track and move this config before optimizer - crf_decode = 
fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) - - chunk_evaluator = fluid.evaluator.ChunkEvaluator( - input=crf_decode, - label=target, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target - ], - place=place) - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - start_time = time.time() - batch_id = 0 - exe.run(fluid.default_startup_program()) - embedding_param = fluid.global_scope().find_var( - embedding_name).get_tensor() - embedding_param.set( - load_parameter(conll05.get_embedding(), word_dict_len, word_dim), - place) - for pass_id in xrange(PASS_NUM): - chunk_evaluator.reset(exe) - for data in train_data(): - cost, precision, recall, f1_score = exe.run( - trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + chunk_evaluator.metrics) - pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( - exe) - - if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) - if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) - - batch_id = batch_id + 1 - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/book_distribute/notest_dist_word2vec.py b/python/paddle/fluid/tests/book_distribute/notest_dist_word2vec.py deleted file mode 100644 index 6304927364e2fa9cf570adf8d1cee81b78604cec..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_dist_word2vec.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import os - -PASS_NUM = 100 -EMBED_SIZE = 32 -HIDDEN_SIZE = 256 -N = 5 -BATCH_SIZE = 32 -IS_SPARSE = True -TRAINERS = 2 - -word_dict = paddle.dataset.imikolov.build_dict() -dict_size = len(word_dict) - -first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') -second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') -third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') -forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') -next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - -embed_first = fluid.layers.embedding( - input=first_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_second = fluid.layers.embedding( - input=second_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_third = fluid.layers.embedding( - input=third_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') -embed_forth = fluid.layers.embedding( - input=forth_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='shared_w') - -concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], axis=1) -hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') -predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') -cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) -avg_cost = fluid.layers.mean(cost) -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) -train_reader = paddle.batch( - paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() -# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS) -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - feeder = fluid.DataFeeder( - feed_list=[first_word, second_word, third_word, forth_word, next_word], - place=place) - exe.run(fluid.default_startup_program()) - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_cost_np = exe.run(t.get_trainer_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print("avg_cost_np", avg_cost_np) - if avg_cost_np[0] < 5.0: - exit( - 0) # if avg cost less than 10.0, we think our code is good. 
-else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") -exit(1) diff --git a/python/paddle/fluid/tests/book_distribute/notest_machine_translation.py b/python/paddle/fluid/tests/book_distribute/notest_machine_translation.py deleted file mode 100644 index f5ef08430e0d3ec340f1859d0ec4343e03099a6f..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_machine_translation.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.framework as framework -import paddle.fluid.layers as layers -from paddle.fluid.executor import Executor -import os - -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -hidden_dim = 32 -word_dim = 16 -IS_SPARSE = True -batch_size = 10 -max_length = 50 -topk_size = 50 -trg_dic_size = 10000 - -decoder_size = hidden_dim - - -def encoder_decoder(): - # encoder - src_word_id = layers.data( - name="src_word_id", shape=[1], dtype='int64', lod_level=1) - src_embedding = layers.embedding( - input=src_word_id, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_last_step(input=lstm_hidden0) - - # decoder - trg_language_word = layers.data( - name="target_language_word", shape=[1], dtype='int64', lod_level=1) - trg_embedding = layers.embedding( - input=trg_language_word, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - rnn = fluid.layers.DynamicRNN() - with rnn.block(): - current_word = rnn.step_input(trg_embedding) - mem = rnn.memory(init=encoder_out) - fc1 = fluid.layers.fc(input=[current_word, mem], - size=decoder_size, - act='tanh') - out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') - rnn.update_memory(mem, fc1) - rnn.output(out) - - return rnn() - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - rnn_out = encoder_decoder() - label = layers.data( - name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) - cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = fluid.layers.mean(cost) - - optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) - optimize_ops, params_grads = optimizer.minimize(avg_cost) - - 
train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=batch_size) - - place = core.CPUPlace() - exe = Executor(place) - - t = fluid.DistributeTranspiler() - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - exe.run(framework.default_startup_program()) - - batch_id = 0 - for pass_id in xrange(2): - for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - trg_word = to_lodtensor(map(lambda x: x[1], data), place) - trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) - outs = exe.run(trainer_prog, - feed={ - 'src_word_id': word_data, - 'target_language_word': trg_word, - 'target_language_next_word': trg_word_next - }, - fetch_list=[avg_cost]) - avg_cost_val = np.array(outs[0]) - print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + - " avg_cost=" + str(avg_cost_val)) - if batch_id > 3: - exit(0) - batch_id += 1 - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py deleted file mode 100644 index eae1fe62af431a3be677c02460f942efa3c8dde2..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import os - -images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') -conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") -conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - -predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(cost) -optimizer = fluid.optimizer.Adam(learning_rate=0.01) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -BATCH_SIZE = 50 -PASS_NUM = 3 -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -pserver_endpoints = os.getenv("PSERVERS") # all pserver endpoints -trainers = int(os.getenv("TRAINERS")) # total trainer count -current_endpoint = os.getenv("SERVER_ENDPOINT") # current pserver endpoint -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - -t = fluid.DistributeTranspiler() -t.transpile( - optimize_ops, - params_grads, - 0, - pservers=pserver_endpoints, - trainers=trainers) - -if training_role == "PSERVER": - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - accuracy.reset(exe) - batch_id = 0 - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - if batch_id % 100 == 0: - print("batch_id %d, loss: %f, acc: %f" % - (batch_id, loss, pass_acc)) - batch_id += 1 - - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) -else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") diff --git a/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py b/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py deleted file mode 100644 index dad95c0f3fff7fb5f3521e561617a2accad86823..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid -import os - -BATCH_SIZE = 128 -PASS_NUM = 100 - -images = fluid.layers.data(name='x', shape=[784], dtype='float32') - -# TODO(aroraabhinav) Add regularization and error clipping after -# Issue 7432(https://github.com/PaddlePaddle/Paddle/issues/7432) is resolved. -hidden1 = fluid.layers.fc(input=images, size=128, act='relu') -hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') -predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - -label = fluid.layers.data(name='y', shape=[1], dtype='int64') - -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(cost) - -optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) -optimize_ops, params_grads = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -t = fluid.DistributeTranspiler() -# all parameter server endpoints list for spliting parameters -pserver_endpoints = os.getenv("PSERVERS") -# server endpoint for current node -current_endpoint = os.getenv("SERVER_ENDPOINT") -# run as trainer or parameter server -training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) -elif training_role == "TRAINER": - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - accuracy.reset(exe) - batch_id = 0 - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - if batch_id % 100 == 0: - print("batch_id %d, loss: %f, acc: %f" % - (batch_id, loss, pass_acc)) - batch_id += 1 - - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) -else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") diff --git a/python/paddle/fluid/tests/book_distribute/notest_recommender_system_dist.py b/python/paddle/fluid/tests/book_distribute/notest_recommender_system_dist.py deleted file mode 100644 index 4329c821c27998d47e735ff0214775a078b9ec8e..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_recommender_system_dist.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import os -import paddle.v2 as paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers -import paddle.fluid.nets as nets -from paddle.fluid.optimizer import SGDOptimizer - -IS_SPARSE = True -BATCH_SIZE = 256 -PASS_NUM = 100 - - -def get_usr_combined_features(): - USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - uid = layers.data(name='user_id', shape=[1], dtype='int64') - usr_emb = layers.embedding( - input=uid, - dtype='float32', - size=[USR_DICT_SIZE, 32], - param_attr='user_table', - is_sparse=IS_SPARSE) - usr_fc = layers.fc(input=usr_emb, size=32) - USR_GENDER_DICT_SIZE = 2 - - usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') - usr_gender_emb = layers.embedding( - input=usr_gender_id, - size=[USR_GENDER_DICT_SIZE, 16], - param_attr='gender_table', - is_sparse=IS_SPARSE) - usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) - - USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") - usr_age_emb = layers.embedding( - input=usr_age_id, - size=[USR_AGE_DICT_SIZE, 16], - is_sparse=IS_SPARSE, - param_attr='age_table') - usr_age_fc = layers.fc(input=usr_age_emb, size=16) - - USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") - usr_job_emb = layers.embedding( - input=usr_job_id, - size=[USR_JOB_DICT_SIZE, 16], - param_attr='job_table', - is_sparse=IS_SPARSE) - usr_job_fc = layers.fc(input=usr_job_emb, size=16) - - concat_embed = layers.concat( - input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - - usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - return usr_combined_features - - -def get_mov_combined_features(): - MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') - mov_emb = layers.embedding( - input=mov_id, - dtype='float32', - size=[MOV_DICT_SIZE, 32], - param_attr='movie_table', - is_sparse=IS_SPARSE) - mov_fc = layers.fc(input=mov_emb, size=32) - - CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - category_id = layers.data(name='category_id', shape=[1], dtype='int64') - mov_categories_emb = layers.embedding( - input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) - mov_categories_hidden = layers.sequence_pool( - input=mov_categories_emb, pool_type="sum") - - MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') - mov_title_emb = layers.embedding( - input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) - mov_title_conv = nets.sequence_conv_pool( - input=mov_title_emb, - num_filters=32, - filter_size=3, - act="tanh", - pool_type="sum") - - concat_embed = layers.concat( - input=[mov_fc, mov_categories_hidden, 
mov_title_conv], axis=1) - - mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - return mov_combined_features - - -def model(): - usr_combined_features = get_usr_combined_features() - mov_combined_features = get_mov_combined_features() - - # need cos sim - inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) - scale_infer = layers.scale(x=inference, scale=5.0) - - label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(square_cost) - - return avg_cost - - -def func_feed(feeding, data, place): - feed_tensors = {} - for (key, idx) in feeding.iteritems(): - tensor = core.LoDTensor() - if key != "category_id" and key != "movie_title": - if key == "score": - numpy_data = np.array(map(lambda x: x[idx], data)).astype( - "float32") - else: - numpy_data = np.array(map(lambda x: x[idx], data)).astype( - "int64") - else: - numpy_data = map(lambda x: np.array(x[idx]).astype("int64"), data) - lod_info = [len(item) for item in numpy_data] - offset = 0 - lod = [offset] - for item in lod_info: - offset += item - lod.append(offset) - numpy_data = np.concatenate(numpy_data, axis=0) - tensor.set_lod([lod]) - - numpy_data = numpy_data.reshape([numpy_data.shape[0], 1]) - tensor.set(numpy_data, place) - feed_tensors[key] = tensor - return feed_tensors - - -def main(): - cost = model() - optimizer = SGDOptimizer(learning_rate=0.2) - optimize_ops, params_grads = optimizer.minimize(cost) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.movielens.train(), buf_size=8192), - batch_size=BATCH_SIZE) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv("TRAINING_ROLE", "TRAINER") - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - exe.run(fluid.default_startup_program()) - trainer_prog = t.get_trainer_program() - - feeding = { - 'user_id': 0, - 'gender_id': 1, - 'age_id': 2, - 'job_id': 3, - 'movie_id': 4, - 'category_id': 5, - 'movie_title': 6, - 'score': 7 - } - - for pass_id in range(PASS_NUM): - for data in train_reader(): - outs = exe.run(trainer_prog, - feed=func_feed(feeding, data, place), - fetch_list=[cost]) - out = np.array(outs[0]) - print("cost=" + str(out[0])) - if out[0] < 6.0: - print("Training complete. Average cost is less than 6.0.") - # if avg cost less than 6.0, we think our code is good. 
- exit(0) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py b/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py deleted file mode 100644 index ee0d8597b75e6326b60276668a85cd8422327e48..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import os -import numpy as np -import paddle.v2 as paddle -import paddle.fluid as fluid - - -def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, - hid_dim=32): - emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=3, - act="tanh", - pool_type="sqrt") - conv_4 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=4, - act="tanh", - pool_type="sqrt") - prediction = fluid.layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) - adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) - optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) - accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) - return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def main(): - BATCH_SIZE = 100 - PASS_NUM = 5 - - word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net( - data, label, input_dim=dict_dim, class_dim=class_dim) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=BATCH_SIZE) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - t = fluid.DistributeTranspiler() - - # all parameter server endpoints list for spliting parameters - pserver_endpoints = os.getenv("PSERVERS") - # server endpoint for current node - current_endpoint = os.getenv("SERVER_ENDPOINT") - # run as trainer or parameter server - training_role = os.getenv( - "TRAINING_ROLE", "TRAINER") # get the training role: 
trainer/pserver - t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - - if training_role == "PSERVER": - if not current_endpoint: - print("need env SERVER_ENDPOINT") - exit(1) - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif training_role == "TRAINER": - exe.run(fluid.default_startup_program()) - trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - - for pass_id in xrange(PASS_NUM): - accuracy.reset(exe) - for data in train_data(): - cost_val, acc_val = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[cost, acc_out]) - pass_acc = accuracy.eval(exe) - print("cost=" + str(cost_val) + " acc=" + str(acc_val) + - " pass_acc=" + str(pass_acc)) - if cost_val < 1.0 and pass_acc > 0.8: - exit(0) - else: - print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py b/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py deleted file mode 100644 index fa792cbf92906fe4e922ab9f9fac0245a56b414d..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py b/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
deleted file mode 100644
index fa792cbf92906fe4e922ab9f9fac0245a56b414d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import os
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-
-
-def stacked_lstm_net(data,
-                     label,
-                     input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
-    # add bias attr
-
-    # TODO(qijun) linear act
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
-
-
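The one subtle piece of `stacked_lstm_net` above is the direction schedule: layer `i` runs reversed exactly when `i` is even, and the `stacked_num % 2 == 1` assertion keeps the topmost LSTM running forward. A tiny sketch of that schedule on its own:

```python
def lstm_directions(stacked_num=3):
    # Mirrors is_reverse=(i % 2) == 0 in the loop above; an odd
    # stacked_num guarantees the last (topmost) layer is forward.
    assert stacked_num % 2 == 1
    return [(i, (i % 2) == 0) for i in range(2, stacked_num + 1)]

print(lstm_directions(5))
# [(2, True), (3, False), (4, True), (5, False)]
```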
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "loaded word dict successfully"
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net(
-        data, label, input_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-    t = fluid.DistributeTranspiler()
-    # all parameter server endpoints list for spliting parameters
-    pserver_endpoints = os.getenv("PSERVERS")
-    # server endpoint for current node
-    current_endpoint = os.getenv("SERVER_ENDPOINT")
-    # run as trainer or parameter server
-    training_role = os.getenv(
-        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
-    t.transpile(
-        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
-
-    if training_role == "PSERVER":
-        if not current_endpoint:
-            print("need env SERVER_ENDPOINT")
-            exit(1)
-        pserver_prog = t.get_pserver_program(current_endpoint)
-        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
-        exe.run(pserver_startup)
-        exe.run(pserver_prog)
-    elif training_role == "TRAINER":
-        exe.run(fluid.default_startup_program())
-        trainer_prog = t.get_trainer_program()
-        for pass_id in xrange(PASS_NUM):
-            accuracy.reset(exe)
-            for data in train_data():
-                cost_val, acc_val = exe.run(trainer_prog,
-                                            feed=feeder.feed(data),
-                                            fetch_list=[cost, acc_out])
-                pass_acc = accuracy.eval(exe)
-                print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                      " pass_acc=" + str(pass_acc))
-                if cost_val < 1.0 and acc_val > 0.8:
-                    exit(0)
-    else:
-        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 1dc6d107d214165dd42cb728a56a55baf9b157fb..fc25786499ff054a32e5503e796992d7f1e3ba02 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -145,5 +145,43 @@ class TestMultiBoxHead(unittest.TestCase):
         return mbox_locs, mbox_confs, box, var
 
 
+class TestDetectionMAP(unittest.TestCase):
+    def test_detection_map(self):
+        program = Program()
+        with program_guard(program):
+            detect_res = layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = layers.data(
+                name='label',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+
+            map_out, accum_pos_count_out, accum_true_pos_out, accum_false_pos_out = layers.detection_map(
+                detect_res=detect_res, label=label)
+            self.assertIsNotNone(map_out)
+            self.assertIsNotNone(accum_pos_count_out)
+            self.assertIsNotNone(accum_true_pos_out)
+            self.assertIsNotNone(accum_false_pos_out)
+            self.assertEqual(map_out.shape, (1, ))
+            map_out, accum_pos_count_out2, accum_true_pos_out2, accum_false_pos_out2 = layers.detection_map(
+                detect_res=detect_res, label=label)
+            self.assertIsNotNone(map_out)
+            self.assertIsNotNone(accum_pos_count_out2)
+            self.assertIsNotNone(accum_true_pos_out2)
+            self.assertIsNotNone(accum_false_pos_out2)
+            self.assertEqual(map_out.shape, (1, ))
+            self.assertEqual(accum_pos_count_out.shape,
+                             accum_pos_count_out2.shape)
+            self.assertEqual(accum_true_pos_out.shape,
+                             accum_true_pos_out2.shape)
+            self.assertEqual(accum_false_pos_out.shape,
+                             accum_false_pos_out2.shape)
+        print(str(program))
+
+
 if __name__ == '__main__':
     unittest.main()
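The new `TestDetectionMAP` case pins down an API property rather than numerics: `detection_map` returns accumulator outputs alongside the mAP value, so the metric can be aggregated across batches by feeding the accumulators back in. As a rough illustration of that accumulate-then-evaluate idea (generic precision bookkeeping, not Paddle's actual kernel):

```python
def update_ap_stats(is_true_pos, accum_tp=0, accum_fp=0):
    # Fold one batch of match results into running totals; threading
    # the totals through successive calls is what the accum_* outputs
    # of detection_map model.
    tp = accum_tp + sum(1 for h in is_true_pos if h)
    fp = accum_fp + sum(1 for h in is_true_pos if not h)
    precision = float(tp) / max(tp + fp, 1)
    return tp, fp, precision

tp, fp, p = update_ap_stats([True, False])
tp, fp, p = update_ap_stats([True], accum_tp=tp, accum_fp=fp)
print(tp, fp, round(p, 3))  # 2 1 0.667
```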
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 9355f51311e33729c0cb8ff321010235aafa4063..f96c2ca4f0593b6c2624d449304f23425c69ab93 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op)
 list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
 list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
 list(REMOVE_ITEM TEST_OPS test_profiler)
+list(REMOVE_ITEM TEST_OPS test_nvprof)
 list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
 list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
 list(REMOVE_ITEM TEST_OPS test_assign_value_op)
@@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op)
 py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
 py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
 py_test_modules(test_profiler MODULES test_profiler)
+py_test_modules(test_nvprof MODULES test_nvprof)
 py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
 py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
 py_test_modules(test_assign_value_op MODULES test_assign_value_op)
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index 9f9af2f55e2e9a1c624fb95f1c113e24c2de4a89..f7461ee6dab699064153332116449c8e20a0bac0 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -46,7 +46,20 @@ def bipartite_match(distance, match_indices, match_dist):
             idx += 1
 
 
-def batch_bipartite_match(distance, lod):
+def argmax_match(distance, match_indices, match_dist, threshold):
+    r, c = distance.shape
+    for j in xrange(c):
+        if match_indices[j] != -1:
+            continue
+        col_dist = distance[:, j]
+        indices = np.argwhere(col_dist >= threshold).flatten()
+        if len(indices) < 1:
+            continue
+        match_indices[j] = indices[np.argmax(col_dist[indices])]
+        match_dist[j] = col_dist[match_indices[j]]
+
+
+def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
     """Bipartite Matching algorithm for batch input.
     Arg:
         distance (numpy.array) : The distance of two entries with shape [M, N].
@@ -59,6 +72,9 @@ def batch_bipartite_match(distance, lod):
     for i in range(len(lod) - 1):
         bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
                         match_dist[i, :])
+        if match_type == 'per_prediction':
+            argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
+                         match_dist[i, :], dist_threshold)
     return match_indices, match_dist
 
 
@@ -71,8 +87,8 @@ class TestBipartiteMatchOpWithLoD(OpTest):
 
         self.inputs = {'DistMat': (dist, lod)}
         self.outputs = {
-            'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDist': (match_dist),
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_dist,
         }
 
     def test_check_output(self):
@@ -96,5 +112,27 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
         self.check_output()
 
 
+class TestBipartiteMatchOpWithPerPredictionType(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[0, 5, 11, 23]]
+        dist = np.random.random((23, 237)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0],
+                                                          'per_prediction', 0.5)
+
+        self.inputs = {'DistMat': (dist, lod)}
+        self.outputs = {
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_dist,
+        }
+        self.attrs = {
+            'match_type': 'per_prediction',
+            'dist_threshold': 0.5,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == '__main__':
     unittest.main()
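A quick sanity run makes the new `per_prediction` semantics concrete. This is the `argmax_match` reference helper from the hunk above, with `xrange` swapped for `range` so it also runs on Python 3:

```python
import numpy as np

def argmax_match(distance, match_indices, match_dist, threshold):
    # Each still-unmatched column takes the row with the largest
    # distance, but only if that distance clears the threshold.
    r, c = distance.shape
    for j in range(c):
        if match_indices[j] != -1:
            continue
        col_dist = distance[:, j]
        indices = np.argwhere(col_dist >= threshold).flatten()
        if len(indices) < 1:
            continue
        match_indices[j] = indices[np.argmax(col_dist[indices])]
        match_dist[j] = col_dist[match_indices[j]]

dist = np.array([[0.9, 0.2], [0.4, 0.6]])
match_indices = np.array([-1, -1])
match_dist = np.zeros(2)
argmax_match(dist, match_indices, match_dist, threshold=0.5)
print(match_indices, match_dist)  # [0 1] [0.9 0.6]
```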
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 1fada38a0359d4a0130d58ae2f8442fbfce21e2e..1321cfd484ec8be1d8a817535386db949d825574 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -210,6 +210,19 @@ class TestWithDilation(TestConv2dOp):
         self.groups = 3
 
 
+class TestWithInput1x1Filter1x1(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
 #----------------Conv2dCUDNN----------------
 class TestCUDNN(TestConv2dOp):
     def init_op_type(self):
@@ -241,6 +254,12 @@ class TestCUDNNWith1x1(TestWith1x1):
         self.op_type = "conv2d"
 
 
+class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
 class TestDepthwiseConv(TestConv2dOp):
     def init_test_case(self):
         self.pad = [1, 1]
@@ -265,7 +284,8 @@ class TestDepthwiseConv2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"
 
 
-# cudnn v5 does not support dilation conv.
+# Please don't remove the following code.
+# Currently, CI uses cuDNN v5.0, which does not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 9831b7eb1269938cfcb1d6a3a2940836ee19ed56..d864b9b348e961c585749d47d449d775b2dfebc9 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -200,7 +200,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
-# #cudnn v5 does not support dilation conv.
+# Please don't remove the following code.
+# Currently, CI uses cuDNN v5.0, which does not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
 #     def init_test_case(self):
 #         self.pad = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 4d3df5e33c426c92213a199b4841f649446c6e28..d5dd63e8737cbdd9b91d083fbd0b38f8baf570b3 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -200,6 +200,22 @@ class TestWith1x1(TestConv3dOp):
         self.groups = 3
 
 
+class TestWithInput1x1Filter1x1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 1, 1, 1]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1, 1]
+
+    def init_dilation(self):
+        self.dilations = [1, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
 class TestWithDilation(TestConv3dOp):
     def init_test_case(self):
         self.pad = [0, 0, 0]
@@ -240,6 +256,12 @@ class TestWith1x1CUDNN(TestWith1x1):
         self.op_type = "conv3d"
 
 
+class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d"
+
+
 # FIXME(typhoonzero): find a way to determine if
 # using cudnn > 6 in python
 # class TestWithDilationCUDNN(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index a79bfa13d6ca21cac6640deb5a21ef457af984df..55ba238710c56dd0daea388cd2dcdb79243bb71e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -207,7 +207,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
-# #cudnn v5 does not support dilation conv.
+# Please don't remove the following code.
+# Currently, CI uses cuDNN v5.0, which does not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
 #     def init_test_case(self):
 #         self.pad = [1, 1, 1]
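All of the conv tests touched above share one structure: `setUp` delegates to overridable `init_*` hooks, so each new case (1x1 input and filter, cuDNN on or off) is a small subclass. A stripped-down, Paddle-free sketch of that pattern; the class names here are illustrative only:

```python
import unittest

class ConvCaseBase(unittest.TestCase):
    # setUp delegates to init_* hooks; subclasses override the hooks
    # instead of rewriting setUp. Hook order matters: init_group must
    # run first because init_test_case reads self.groups.
    def setUp(self):
        self.use_cudnn = False
        self.init_group()
        self.init_test_case()

    def init_group(self):
        self.groups = 1

    def init_test_case(self):
        self.input_size = [2, 3, 5, 5]  # NCHW
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]

    def test_channels_divisible_by_groups(self):
        self.assertEqual(self.input_size[1] % self.groups, 0)

class ConvCase1x1Input1x1Filter(ConvCaseBase):
    def init_group(self):
        self.groups = 3

    def init_test_case(self):
        self.input_size = [2, 3, 1, 1]
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 1, 1]

if __name__ == '__main__':
    unittest.main()
```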
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
index c867cfcdb6ec810ddd5add833b06d4adf266d86b..5c221a0325b6cdc27ec22e5a8b02ae8eec9f6d80 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
-import math
 import copy
+import math
+import unittest
 
-import paddle.fluid.framework as framework
 import paddle.fluid as fluid
-import paddle.fluid.layers as layers
+import paddle.fluid.framework as framework
 import paddle.fluid.learning_rate_decay as lr_decay
 
@@ -90,12 +88,9 @@ class TestLearningRateDecay(unittest.TestCase):
         exe.run(fluid.default_startup_program())
         for step in range(10):
-            step_val, lr_val = exe.run(
-                fluid.default_main_program(),
-                feed=[],
-                fetch_list=[
-                    fluid.layers.autoincreased_step_counter(), decayed_lr
-                ])
+            lr_val, = exe.run(fluid.default_main_program(),
+                              feed=[],
+                              fetch_list=[decayed_lr])
             python_decayed_lr = python_decay_fn(
                 global_step=float(step), **kwargs)
             self.assertAlmostEqual(
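Fetching only `decayed_lr` also removes a trap: each call to `fluid.layers.autoincreased_step_counter()` inside the loop kept adding fresh ops to the default main program between runs, so tracking `step` in Python is the reliable form. For reference, a pure-Python decay rule of the kind the test compares against (exponential decay shown; the file's actual `python_decay_fn` definitions are not in this hunk, so treat the signature as an assumption):

```python
import math

def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                      staircase=False):
    # Standard exponential schedule: lr * rate^(step/decay_steps);
    # staircase=True floors the exponent to decay in discrete jumps.
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * decay_rate ** exponent

for step in range(3):
    print(step, exponential_decay(1.0, step, decay_steps=5, decay_rate=0.5))
```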
diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py
new file mode 100644
index 0000000000000000000000000000000000000000..226e5e5d1131b1f33cfbbfefec536e6974f85b36
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_nvprof.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.layers as layers
+import paddle.fluid.core as core
+
+
+class TestNVProf(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        output_file = 'cuda_profiler.txt'
+        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+        os.remove(output_file)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index d9444b50a2362d4d122ea880d47d337426fbdc96..f6f581ff7d67260dad50b285aa35276698fd7130 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -22,27 +22,9 @@ import paddle.fluid.core as core
 
 
 class TestProfiler(unittest.TestCase):
-    def test_nvprof(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        epoc = 8
-        dshape = [4, 3, 28, 28]
-        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        output_file = 'cuda_profiler.txt'
-        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-            for i in range(epoc):
-                input = np.random.random(dshape).astype('float32')
-                exe.run(fluid.default_main_program(), feed={'data': input})
-        os.remove(output_file)
-
     def net_profiler(self, state):
-        if state == 'GPU' and not core.is_compiled_with_cuda():
+        enable_if_gpu = state == 'GPU' or state == "All"
+        if enable_if_gpu and not core.is_compiled_with_cuda():
             return
         startup_program = fluid.Program()
         main_program = fluid.Program()
@@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase):
     def test_cuda_profiler(self):
         self.net_profiler('GPU')
 
+    def test_all_profiler(self):
+        self.net_profiler('All')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 23f5a24a1cea7f665fb65e802e1a7811df78208d..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -41,6 +41,26 @@ EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
 UNK_IDX = 0
 
 
+def load_label_dict(filename):
+    d = dict()
+    tag_dict = set()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if line.startswith("B-"):
+                tag_dict.add(line[2:])
+            elif line.startswith("I-"):
+                tag_dict.add(line[2:])
+    index = 0
+    for tag in tag_dict:
+        d["B-" + tag] = index
+        index += 1
+        d["I-" + tag] = index
+        index += 1
+    d["O"] = index
+    return d
+
+
 def load_dict(filename):
     d = dict()
     with open(filename, 'r') as f:
@@ -188,7 +208,7 @@ def get_dict():
     verb_dict = load_dict(
         paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
                                           VERBDICT_MD5))
-    label_dict = load_dict(
+    label_dict = load_label_dict(
         paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
                                           TRGDICT_MD5))
     return word_dict, verb_dict, label_dict
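The switch from `load_dict` to `load_label_dict` matters because the CoNLL-2005 labels are BIO tags: `B-X` and `I-X` must land on adjacent indices even when only one of the pair appears in the dictionary file. The same indexing scheme, reworked to take an iterable so it can be run without a file (`build_bio_dict` is our stand-in name):

```python
def build_bio_dict(lines):
    # Same scheme as load_label_dict above: every tag X gets the pair
    # B-X -> 2k, I-X -> 2k+1, and "O" takes the next free index.
    tags = set()
    for line in lines:
        line = line.strip()
        if line.startswith("B-") or line.startswith("I-"):
            tags.add(line[2:])
    d = {}
    index = 0
    for tag in tags:
        d["B-" + tag] = index
        index += 1
        d["I-" + tag] = index
        index += 1
    d["O"] = index
    return d

print(build_bio_dict(["B-A0", "I-A0", "B-A1", "O"]))
# e.g. {'B-A0': 0, 'I-A0': 1, 'B-A1': 2, 'I-A1': 3, 'O': 4}
# (set iteration order decides which tag gets the lower pair)
```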