diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 43866da9cb113e9d49fc1c51f67da94cbc6bfd8e..dc10ac2ec195acc9a5693718141ddb32417dfb71 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -139,9 +139,6 @@ def run_benchmark(model, args):
 
     # inference program
     inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
 
     # Optimization
     opt = fluid.optimizer.AdamOptimizer(
@@ -161,7 +158,7 @@ def run_benchmark(model, args):
     train_reader = paddle.batch(
         paddle.dataset.mnist.train(), batch_size=args.batch_size)
 
-    accuracy = fluid.average.WeightedAverage()
+    accuracy = fluid.metrics.Accuracy()
     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
@@ -184,7 +181,7 @@ def run_benchmark(model, args):
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.add(value=outs[1], weight=outs[2])
+            accuracy.update(value=outs[1], weight=outs[2])
             iters += 1
             num_samples += len(y_data)
             loss = np.array(outs[0])
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 6320b17520a687f88993b6f464d9115838b0f96b..52a22c1fbf4779fa3c0ca687cab664bd3ca0410a 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -62,29 +62,33 @@ endif()
 
 
 ## Then find the reference-cblas.  www.netlib.org/blas/
-
-
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include
-  /usr/include/cblas
-)
-
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib
-  /usr/lib/blas/reference/
-  /usr/lib/reference/
-)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/include
+    /usr/include
+    /usr/include/cblas
+  )
+
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/lib
+    /usr/lib
+    /usr/lib/blas/reference/
+    /usr/lib/reference/
+  )
+else()
+  # Diable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()
 
 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
         ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
         ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 0853b981813c5d60a12603471df7e0b216b0822f..aa249159470773241e0f6da2e8e086264634dd4a 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.11.x"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
deleted file mode 100644
index af5c689c3524741a88518eeb3f85996872257677..0000000000000000000000000000000000000000
--- a/cmake/external/nccl.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_GPU)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
-
-include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
-
-if(WITH_DSO)
-  # If we use DSO, we do not build nccl, just download the dependencies
-  set(NCCL_BUILD_COMMAND "")
-  set(NCCL_INSTALL_COMMAND "")
-  set(NCCL_INSTALL_DIR "")
-else()
-  # otherwise, we build nccl and link it.
-  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
-  # Note: cuda 8.0 is needed to make nccl
-  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
-  set(NCCL_BUILD_COMMAND "make -j 8")
-  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
-endif()
-
-ExternalProject_Add(
-    extern_nccl
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
-    GIT_TAG         "v1.3.4-1"
-    PREFIX          "${NCCL_SOURCE_DIR}"
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
-    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
-    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
-    TEST_COMMAND      ""
-)
-
-if(WITH_DSO)
-  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
-    add_library(nccl STATIC ${dummyfile})
-  else()
-    add_library(nccl INTERFACE)
-  endif()
-else()
-  add_library(nccl STATIC IMPORTED GLOBAL)
-  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
-               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
-endif()
-
-add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 71f54c425d4c38e271a8f1b78887d95a27252443..80282329c6ac65fbd1493a6838efca4bd9cadaad 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
     return()
-ENDIF()
+endif()
 
 include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 
 ExternalProject_Add(
     extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
 
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 8f7a3bf8eeaef75c8840f4ea318b484d33249bb7..20a96430823d07a07d4bb4602e7fc0cfe55c3bf2 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
     return()
 ENDIF()
 
@@ -21,9 +20,11 @@ include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 
 ExternalProject_Add(
         extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
 
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c4c9f77df8d57fe162616d2250bd4dfe5b7754e7..1d3e2ade6d393c6e4c37eea0dc1064cdb18808a5 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
-      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      endif()
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 0323cd9698cba916d2aa04403be97c0a6a463830..cc758019827b9a5416a801e4da43d754d4492a73 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -1,7 +1,22 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
   string(FIND "${__target_path}" "fluid" pos)
   if(pos GREATER 1)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
     )
 endif()
 
+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
+
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 5e147f8263e685a4665b5793f7127178cbc3cfdd..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -119,7 +119,7 @@ An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Pad
 
 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
 
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
 
 ## Turing Completeness
 
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index 23615f8830e99633676c83ec5d28139a732c623c..4231f2bb5cd800c0cd86835b5d07e491fcde4989 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -65,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 不使用PaddlePaddle.org工具
 --------------------------
 
-使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
 
-[TBD]
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
 
 如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
 
 .. code-block:: bash
 
-   mkdir paddle
-   cd paddle
    git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 
    # 如果只需要构建使用文档，则执行以下命令
-   make -j $processors gen_proto_py
-   make -j $processors paddle_docs paddle_docs_cn
+   make -j $processors paddle_docs
 
    # 如果只需要构建API，则执行以下命令
-   make -j $processors gen_proto_py framework_py_proto
-   make -j $processors copy_paddle_pybind
-   make -j $processors paddle_api_docs
+   make -j $processors paddle_apis
 
 其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
 
-编译完成后，进入 ``doc/v2`` 目录，如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会生成 ``api/en/html`` 目录，分别进入这些目录下，执行以下命令：
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
 
 .. code-block:: bash
 
    python -m SimpleHTTPServer 8088
 
-在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index 15ff0d34ad622f100fe98d8738b830e47c35b41b..6105455e202e4704aa25f0fd9916b9b61a569702 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
 Manually Building the Documentation
 -------------------------------------
 
-Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
+Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to ` Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
 
-[TBD]
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page.
 
 If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
 
 .. code-block:: bash
 
-   mkdir paddle
-   cd paddle
+
    git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 
    # If you only need to build documents, use the following commands
-   make -j $processors gen_proto_py
-   make -j $processors paddle_docs paddle_docs_cn
+   make -j $processors paddle_docs
 
    # If you only need to build APIs, use the following commands
-   make -j $processors gen_proto_py framework_py_proto
-   make -j $processors copy_paddle_pybind
-   make -j $processors paddle_api_docs
+   make -j $processors paddle_apis
 
 $processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
 
-After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs，it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
+After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html``  will be generated in both two directories. If you chose to build APIs，a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
 
 .. code-block:: bash
 
    python -m SimpleHTTPServer 8088
 
-Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c44f8a8a8ecc1ba1f886fc41aec863b4ca3458a6..8b1ca5e16548334ed0c9a6d31b88e0805304579e 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 
 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
   add_subdirectory(fluid)
 endif()
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d725763b01d5953985f8e090605f68a8419b5498..d274d96c29bdbf5973d568d783369c3975bdc436 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 3840bbe83b68dc2a49aa73feb57a80e9992cad5f..1f3ca24df16cf080d325fbdc0d613a828e384b2a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -79,14 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table feed_fetch_method)
+framework_proto glog lod_rank_table feed_fetch_method)
 
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc
deleted file mode 100644
index 1314af2b3dab281bd201e6a77bfbe87e0bd58ffb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward.cc
+++ /dev/null
@@ -1,585 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/backward.h"
-#include "paddle/fluid/operators/net_op.h"
-
-#include <deque>
-#include <list>
-#include <memory>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace framework {
-
-static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
-// Control Flow operators's backward is significantly different from
-// computational operators. Hack Code here.
-// We should design a better way to backward CtrlFlowOps.
-static std::unordered_set<std::string>& CtrlFlowOps() {
-  if (g_ctrl_flow_ops_ == nullptr) {
-    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
-        "increment", "lod_rank_table", "less_than"};
-  }
-  return *g_ctrl_flow_ops_;
-}
-
-static inline std::unique_ptr<OperatorBase> CreateGradOp(
-    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDesc op_desc;
-  op_desc.SetInputMap(op.Inputs());
-  op_desc.SetOutputMap(op.Outputs());
-  op_desc.SetType(op.Type());
-  op_desc.SetAttrMap(op.Attrs());
-  auto& info = OpInfoMap::Instance().Get(op.Type());
-  auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
-  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
-  grad_ops.reserve(grad_descs.size());
-  std::transform(grad_descs.begin(), grad_descs.end(),
-                 std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDesc>& grad_desc) {
-                   return OpRegistry::CreateOp(*grad_desc);
-                 });
-  PADDLE_ENFORCE(!grad_ops.empty());
-  if (grad_ops.size() == 1) {
-    return std::move(grad_ops[0]);
-  } else {
-    auto net_op = new operators::NetOp();
-    for (auto& grad_op : grad_ops) {
-      net_op->AppendOp(std::move(grad_op));
-    }
-    net_op->CompleteAddOp();
-    return std::unique_ptr<OperatorBase>(net_op);
-  }
-}
-
-template <typename Map, typename T>
-static void ForEachVarName(const Map& names, T callback) {
-  for (auto& name : names) {
-    for (auto& n : name.second) {
-      if (callback(n)) return;
-    }
-  }
-}
-
-// return whether all the names + suffixes in the set
-static bool AllInSet(
-    const std::map<std::string, std::vector<std::string>>& names,
-    const std::string& suffix, const std::unordered_set<std::string>& set) {
-  bool all_in_set = true;
-  ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
-    all_in_set = set.find(n + suffix) != set.end();
-    return !all_in_set;
-  });
-  return all_in_set;
-}
-
-static std::unique_ptr<OperatorBase> NOP() {
-  auto net_op = new operators::NetOp();
-  net_op->SetType("@NOP@");
-  net_op->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(net_op);
-}
-
-//  Get backward operator from a forward operator, a recursive implementation.
-//
-//  no_grad_names the gradient variable names without gradient calculating.
-//
-//  uniq_id is a unique index used inside recursively calling
-//  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
-//  pass `uniq_id` through recursive calling.
-//
-//  returns The backward operator. In a simple situation, it may be a simple
-//  operator, in a complex situation, it maybe a NetOp.
-//
-//  See Backward.h for details
-static std::unique_ptr<OperatorBase> BackwardRecursive(
-    const OperatorBase& forwardOp,
-    std::unordered_set<std::string>& no_grad_names,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    size_t& uniq_id) {
-  //  If all input gradients of forwarding operator do not need to calculate,
-  //  just return an NOP. Not return null ptr because NOP does not take
-  //  too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    return NOP();
-  }
-
-  //  All output gradients of forwarding operator do not need to calculate.
-  //  Then all input gradients cannot be computed at all, and we put them into
-  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    ForEachVarName(forwardOp.Inputs(),
-                   [&no_grad_names](const std::string& name) -> bool {
-                     no_grad_names.insert(GradVarName(name));
-                     return false;
-                   });
-    return NOP();
-  }
-
-  // Returned gradient network
-  auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
-
-  if (forwardOp.IsNetOp()) {
-    // Because forwardOp is a net op, it can static_cast.
-    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
-
-    // Map from output gradient variable name to operator's indices in
-    // backward net's ops_. That operator generates that variable.
-    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
-
-    size_t local_op_id = 0;
-    // reversely travel forwardNet and collect all duplicate outputs.
-    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
-         ++it, ++local_op_id) {
-      auto& fwd = *it;
-      auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
-      ForEachVarName(bwd->Outputs(),
-                     [&dup_output_ops, local_op_id](const std::string& out) {
-                       dup_output_ops[out].emplace_back(local_op_id);
-                       return false;
-                     });
-      net->AppendOp(std::move(bwd));
-    }
-    // Get unique ID for this method.
-    auto uid = uniq_id++;
-    // TODO(dzh): more comment
-    // multiple operators which have the same output (y for example) may
-    // overwrite the same y variable when backward, special operations are token
-    // to handle this case. For each duplicate output, rename it to an alias
-    // (original name with a offset), append an `add` op for its operator,
-    // and finally sum all the alias variable to the final output variable y.
-    using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
-    std::list<Pos> insert_position;
-    for (auto& dup_output_op : dup_output_ops) {
-      const std::string& name = dup_output_op.first;
-      // duplicate @Empty@ don't need to be added
-      if (name == kEmptyVarName) continue;
-
-      auto& dup_op = dup_output_op.second;
-      // no duplicate output
-      if (dup_op.size() == 1) continue;
-
-      // process the duplicate outputs
-      std::vector<std::string> dup_outputs;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        // rename each duplicate output to an alias
-        auto op_offset = dup_op[i];
-        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
-                              std::to_string(i));
-        net->ops_[op_offset]->Rename(name, dup_outputs.back());
-      }
-      // collect all the offset for each alias,
-      // insert a sum operator to add all aliases to output
-      insert_position.push_back(
-          {dup_op.back(),
-           OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
-                                AttributeMap{})});
-    }
-
-    // make sure the inserted `sum` ops follow the BFS order.
-    insert_position.sort(
-        [](const Pos& l, const Pos& r) { return l.first > r.first; });
-
-    for (auto& pos : insert_position) {
-      net->InsertOp(pos.first + 1, std::move(pos.second));
-    }
-  } else {
-    std::unique_ptr<OperatorBase> grad_op(
-        CreateGradOp(forwardOp, no_grad_names, grad_to_var));
-
-    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
-                                          const std::string& grad_input) {
-      if (no_grad_names.count(grad_input)) {
-        // +1 for \0
-        std::string prefix = grad_input.substr(
-            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
-
-        // If part of input gradient of that operator is not calculated, fill
-        // zero variables to that input gradient.
-        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Out", {grad_input}}},
-                                           AttributeMap{}));
-      }
-      return false;
-    });
-
-    ForEachVarName(grad_op->Outputs(),
-                   [&no_grad_names, &grad_op](const std::string& grad_output) {
-                     if (no_grad_names.count(grad_output)) {
-                       grad_op->Rename(grad_output, kEmptyVarName);
-                     }
-                     return false;
-                   });
-
-    if (net->ops_.empty()) {  // Current no aux op is added to network
-      return grad_op;
-    }
-    net->AppendOp(std::move(grad_op));
-  }
-  net->SetType("@GENERATED_BACKWARD@");
-  net->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(
-      static_cast<OperatorBase*>(net.release()));
-}
-
-// See header for comments
-std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size() + 1);
-
-  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-
-  for (auto& name : no_grad_vars) {
-    no_grad_names.insert(name + kGradVarSuffix);
-  }
-  size_t uid = 0;
-  std::unordered_map<std::string, std::string> grad_to_var;
-  return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
-}
-
-// ====================================  //
-
-static bool AllGradInSet(const std::vector<std::string>& names,
-                         const std::unordered_set<std::string>& set) {
-  for (const std::string& name : names) {
-    if (!set.count(GradVarName(name))) {
-      return false;
-    }
-  }
-  if (VLOG_IS_ON(10)) {
-    std::ostringstream sout;
-    sout << "All input {";
-    for (auto& name : names) {
-      sout << name << ",";
-    }
-    sout << "} is in {";
-    for (auto& name : set) {
-      sout << name << ",";
-    }
-    sout << "}";
-    VLOG(10) << sout.str();
-  }
-  return true;
-}
-
-static std::string FwdName(const std::string& grad_name) {
-  auto pos = grad_name.find("@GRAD");
-  if (pos == std::string::npos) {
-    return "";
-  } else {
-    return grad_name.substr(0, pos);
-  }
-}
-
-static void CreateGradVarInBlock(
-    size_t grad_op_start_index,
-    const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDesc* block_desc,
-    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
-  auto ops = block_desc->AllOps();
-  for (size_t op_index = grad_op_start_index; op_index < ops.size();
-       ++op_index) {
-    std::unordered_set<std::string> new_vars;
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    ForEachVarName(ops[op_index]->Outputs(),
-                   [&](const std::string& grad_var_name) {
-                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
-                         ctrl_flow_ops.end()) {
-                       if (block_desc->HasVarRecursive(grad_var_name)) {
-                         return false;
-                       }
-                     } else {
-                       if (block_desc->HasVar(grad_var_name)) {
-                         return false;
-                       }
-                     }
-                     if (grad_var_name == framework::kEmptyVarName) {
-                       return false;
-                     }
-                     auto var = block_desc->Var(grad_var_name);
-                     VLOG(10) << "Creating Variable " << grad_var_name;
-                     new_vars.insert(var->Name());
-                     auto it = param_name_map.find(grad_var_name);
-                     if (it == param_name_map.end()) {
-                       return false;
-                     }
-                     auto param_var_name = it->second;
-                     auto& grad_record = (*grad_var_record)[param_var_name];
-                     grad_record.name_ = grad_var_name;
-                     grad_record.block_idx_ = block_desc->ID();
-                     grad_record.op_idx_ = static_cast<int>(op_index);
-                     return false; /* not break */
-                   });
-    ops[op_index]->InferVarType(block_desc);
-    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-      if (new_vars.find(arg) == new_vars.end()) {
-        continue;
-      }
-      auto pname = FwdName(arg);
-      auto* param = block_desc->FindVarRecursive(pname);
-      auto* grad = block_desc->FindVar(arg);
-      if (param == nullptr) {
-        grad->SetDataType(proto::VarType::FP32);
-      } else {
-        grad->SetDataType(param->GetDataType());
-      }
-    }
-    ops[op_index]->InferShape(*block_desc);
-  }
-}
-
-std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
-    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
-  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
-  // All input gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
-  if (AllGradInSet(inputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator  " << op_desc->Type();
-    return grad_op_descs;  // empty vector
-  }
-
-  // All output gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
-
-  if (AllGradInSet(outputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator " << op_desc->Type();
-    // FIXME: Hack code here
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
-      // Only computational op need drop input's gradient.
-      for (const std::string& name : inputs) {
-        no_grad_vars->insert(GradVarName(name));
-        VLOG(10) << " Also drop " << GradVarName(name);
-      }
-    }
-
-    return grad_op_descs;  // empty vector
-  }
-
-  grad_op_descs =
-      OpInfoMap::Instance()
-          .Get(op_desc->Type())
-          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
-
-  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
-  for (auto& desc : grad_op_descs) {
-    for (const std::string& in_name : desc->InputArgumentNames()) {
-      if (no_grad_vars->count(in_name)) {
-        std::string prefix = in_name.substr(
-            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        std::string new_name = prefix + kZeroVarSuffix;
-        desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDesc> fill_zeros_op(
-            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
-                       {{"Out", {new_name}}}, AttributeMap{}));
-        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
-      }
-    }
-  }
-
-  for (auto& p : pending_fill_zeros_ops) {
-    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
-  }
-  return grad_op_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx);
-
-std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
-    ProgramDesc& program_desc, int block_idx,
-    std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  VLOG(5) << "MakeBlockBackward";
-  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDesc*> op_descs = cur_block->AllOps();
-  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
-  size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDesc>> backward_descs;
-
-  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
-    VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDesc>> op_grads;
-
-    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
-        (*it)->Type() == "parallel_do") {
-      int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
-                                                  grad_to_var, step_block_idx);
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else if ((*it)->Type() == "conditional_block") {
-      BlockDesc* backward_block =
-          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("sub_block"));
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else {
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
-    }
-
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      sout << "Made ";
-      for (auto& op_grad : op_grads) {
-        sout << op_grad->Type() << " ";
-      }
-      VLOG(10) << sout.str();
-    }
-
-    for (const auto& desc : op_grads) {
-      for (const std::string& out_name : desc->OutputArgumentNames()) {
-        if (out_name.find("@GRAD") == std::string::npos) {
-          // Not all outputs of a backward operator is a gradient. Only gradient
-          // need to be sum. Skip variables are not gradient.
-          continue;
-        }
-        dup_out_ops[out_name].emplace_back(grad_desc_idx);
-      }
-      ++grad_desc_idx;
-    }
-    std::transform(op_grads.begin(), op_grads.end(),
-                   std::back_inserter(backward_descs),
-                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
-  }
-
-  VLOG(5) << "Appending Sums";
-  // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
-  for (const auto& dup : dup_out_ops) {
-    const std::string& out_name = dup.first;
-    const std::vector<size_t> dup_op = dup.second;
-    if (out_name != kEmptyVarName && dup_op.size() > 1) {
-      std::vector<std::string> sum_op_inputs;
-      std::string next_g_name = out_name;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
-                 << " duplicated";
-        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
-        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
-        sum_op_inputs.emplace_back(new_name);
-        next_g_name = sum_op_inputs.back();
-      }
-      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
-                                                {{"Out", {out_name}}},
-                                                AttributeMap{}));
-      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
-    }
-  }
-
-  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
-                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
-    return a.first > b.first;
-  });
-  for (auto& p : pending_sum_ops) {
-    backward_descs.insert(backward_descs.begin() + p.first + 1,
-                          std::move(p.second));
-  }
-
-  VLOG(5) << "MakeBlockBackward Finished";
-
-  return backward_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx) {
-  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
-                                                   no_grad_vars, grad_to_var);
-  BlockDesc* backward_block =
-      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-  for (auto& ptr : backward_block_op_descs) {
-    backward_block->AppendAllocatedOp(move(ptr));
-  }
-  return backward_block;
-}
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_var_names;
-  no_grad_var_names.reserve(no_grad_vars.size() + 1);
-  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-  for (auto& name : no_grad_vars) {
-    no_grad_var_names.insert(GradVarName(name));
-  }
-
-  const int root_block_idx = 0;
-  auto root_block = program_desc.MutableBlock(root_block_idx);
-
-  std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
-  PADDLE_ENFORCE(is_scalar, "target should be scalar");
-  VLOG(3) << "backward from loss=" << target.Name()
-          << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDesc> fill_one_op(
-      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                 {{"shape", std::vector<int>{1}},
-                  {"value", static_cast<float>(1.0)},
-                  {"dtype", target.GetDataType()}}));
-  // infer var type of fill_one_op
-  fill_one_op->InferVarType(root_block);
-
-  root_block->AppendAllocatedOp(std::move(fill_one_op));
-  size_t forward_op_num = root_block->OpSize();
-  size_t forward_block_num = program_desc.Size();
-
-  // Insert backward operators
-  std::unordered_map<std::string, std::string> grad_to_var;
-  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
-                                             &no_grad_var_names, &grad_to_var);
-
-  for (auto& ptr : backward_op_descs) {
-    root_block->AppendAllocatedOp(std::move(ptr));
-  }
-  // Create Variable
-
-  // Create target gradient variable
-  std::unordered_map<std::string, GradVarInfo> retv;
-
-  auto var = root_block->Var(fill_one_op_out);
-  var->SetDataType(target.GetDataType());
-  var->SetShape(target.GetShape());
-  auto& target_grad = retv[target.Name()];
-  target_grad.name_ = fill_one_op_out;
-  target_grad.block_idx_ = root_block_idx;
-  target_grad.op_idx_ = static_cast<int>(forward_op_num);
-
-  // create grad_var for all blocks in this program
-  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
-  for (size_t block_index = forward_block_num;
-       block_index < program_desc.Size(); ++block_index) {
-    CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index),
-                         &retv);
-  }
-  return retv;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h
deleted file mode 100644
index 3a971090c25c85efbf976532c364371baba9a870..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-// Create the backward operator from a forward operator.
-// TODO(yuyang18): Add more API reference comment.
-extern std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-struct GradVarInfo {
-  GradVarInfo() {}
-  GradVarInfo(const std::string& name, int block_idx, int op_idx)
-      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
-
-  bool operator==(const GradVarInfo& b) const {
-    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
-           op_idx_ == b.op_idx_;
-  }
-
-  std::string name_;
-  int block_idx_;
-  int op_idx_;
-};
-
-using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
-                                            GradVarInfo /*grad_var_info*/>;
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc
deleted file mode 100644
index cc1f871360ed3f7071364dbb0f932bfd997cadb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward_test.cc
+++ /dev/null
@@ -1,918 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/backward.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/operators/net_op.h"
-
-USE_NO_KERNEL_OP(fill_constant);
-
-namespace paddle {
-namespace framework {
-
-using DeviceContext = platform::DeviceContext;
-
-class NoneOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-};
-
-template <typename Place, typename T>
-class NoneKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {}
-};
-
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
- public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add");
-    AddInput("b", "Bias of Add");
-    AddOutput("Out", "Out of Add");
-    AddComment("Add Op");
-  }
-};
-
-class RowWiseAddGradMaker : public SingleGradOpDescMaker {
- public:
-  using SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<OpDesc> Apply() const override {
-    auto grad_op = new OpDesc();
-    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
-    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
-    grad_op->SetType("rowwise_add_grad");
-    return std::unique_ptr<OpDesc>(grad_op);
-  }
-};
-
-class MulOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "A");
-    AddInput("Y", "B");
-    AddOutput("Out", "Out");
-    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddComment("Mul");
-  }
-};
-
-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
- public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X");
-    AddOutput("Out", "Y");
-    AddComment("Sigmoid");
-  }
-};
-
-class NoGradOpMaker : public OpProtoAndCheckerMaker {
- public:
-  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X input");
-    AddOutput("Out", "Y output");
-    AddComment("NoGradOp, same input output. no Grad");
-  }
-};
-
-class FcOp : public operators::NetOp {
- public:
-  FcOp(const std::string &type, const VariableNameMap &inputs,
-       const VariableNameMap &outputs, const AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(OpRegistry::CreateOp(
-        "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-        {{"Out", {Output("mul_result")}}}, AttributeMap{}));
-    auto input_b = Inputs("b");
-    std::string before_act = "mul_result";
-    if (input_b.size() != 0) {
-      AppendOp(OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
-          {{"Out", {Output("add_result")}}}, AttributeMap{}));
-      before_act = "add_result";
-    } else {
-      auto out_varname = Output("add_result");
-      if (out_varname != kEmptyVarName) {
-        this->Rename(out_varname, kEmptyVarName);
-      }
-    }
-
-    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                                  {{"Out", {Output("Out")}}}, AttributeMap{}));
-    CompleteAddOp(false);
-  }
-};
-
-class FcOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("W", "w");
-    AddInput("b", "b");
-    AddOutput("mul_result", "").AsIntermediate();
-    AddOutput("add_result", "").AsIntermediate();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
- public:
-  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("x", "x");
-    AddOutput("y", "y");
-    AddOutput("z", "z");
-    AddComment("");
-  }
-};
-
-class FillZeroOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddOutput("Out", "out");
-    AddComment("");
-  }
-};
-
-class SumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
-    AddOutput("Out", "the output tensor of sum operator.");
-    AddComment("");
-  }
-};
-
-class MultInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("H", "h");
-    AddOutput("Y", "y");
-    AddOutput("Z", "z");
-    AddComment("");
-  }
-};
-
-class MinusGradOpDescMaker : public GradOpDescMakerBase {
- public:
-  using GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
-    std::vector<std::unique_ptr<OpDesc>> retv;
-    auto x_g = InputGrad("X");
-    if (!x_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", x_g);
-      op_desc->SetAttr("scale", 1.0f);
-      retv.emplace_back(op_desc);
-    }
-
-    auto y_g = InputGrad("Y");
-    if (!y_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", y_g);
-      op_desc->SetAttr("scale", -1.0f);
-      retv.emplace_back(op_desc);
-    }
-    return retv;
-  }
-};
-
-class MinusOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("Y", "");
-    AddOutput("Out", "");
-    AddComment("minus for unittest");
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-namespace ops = paddle::operators;
-using EnforceNotMet = paddle::platform::EnforceNotMet;
-// rowwise_add
-REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
-                  f::RowWiseAddGradMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// mul
-REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mul_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sigmoid
-REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
-// fill_zeros_like
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
-REGISTER_OP_CPU_KERNEL(fill_zeros_like,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sum
-REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// fc
-REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-// many_output_op
-REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
-            many_output_op_grad, f::NoneOp);
-// mult_in_out
-REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
-            f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mult_in_out,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// minus
-REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
-REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
-// scale
-REGISTER_OPERATOR(scale, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
-
-TEST(Backward, simple_op_not_need_grad) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::Backward(*fwd, {"x"});
-  ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
-
-  auto no_input_gop = f::Backward(*fwd, {"x", "b"});
-  ASSERT_NE(no_input_gop, nullptr);
-  ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
-}
-
-TEST(Backward, net_fc_backward_normal) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_re"}},
-                               {"Out", {"out"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(3UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_add = *net->ops_[1];
-  ASSERT_EQ("rowwise_add_grad", d_add.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[2];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_fc_backward_not_have_b) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_res"}},
-                               {"Out", {"tmp"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(2UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[1];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_input_of_network_not_need_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_tmp_0"}},
-       {"add_result", {"add_tmp_0"}},
-       {"Out", {"hidden0"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_tmp_1"}},
-       {"add_result", {"add_tmp_1"}},
-       {"Out", {"hidden1"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-  auto bwd = Backward(net, {"x"});  // x@GRAD is not need.
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-
-  auto output_vars = bwd_net->OutputVars(true);
-  std::unordered_set<std::string> all_outputs =
-      std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
-  all_outputs.erase(f::kEmptyVarName);
-
-  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
-  }
-
-  // Not Generated X
-  ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
-
-  ASSERT_EQ(2UL, bwd_net->ops_.size());
-  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
-  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
-  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
-  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
-}
-
-TEST(Backward, net_shared_weight) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                       {{"Out", {"out"}}}, f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                       {{"Out", {"FinalOut"}}},
-                                       f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto bwd = f::Backward(net, std::unordered_set<std::string>{});
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-  ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
-}
-
-TEST(Backward, op_all_input_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"x", "b"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_all_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"out"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_part_of_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
-                              {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"Z"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(net->ops_.size(), 2UL);
-
-  auto &fill_zero = *net->ops_[0];
-  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
-  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
-  ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
-
-  auto &d_many_out = *net->ops_[1];
-  ASSERT_EQ("many_output_op_grad", d_many_out.Type());
-  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size());  // I/O/OG
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
-            d_many_out.Input(f::GradVarName("z")));
-  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
-  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
-}
-
-TEST(Backward, op_part_of_input_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
-                                     {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"a"});
-  auto &grad_mul = *backward;
-  ASSERT_EQ(grad_mul.Type(), "mul_grad");
-  ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
-  ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
-  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
-  ASSERT_EQ(grad_mul.Input("X"), "a");
-  ASSERT_EQ(grad_mul.Input("Y"), "b");
-  ASSERT_EQ(grad_mul.Input("Out"), "out");
-}
-
-TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_out1"}},
-       {"add_result", {"add_out1"}},
-       {"Out", {"out1"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_out2"}},
-       {"add_result", {"tmp_out2"}},
-       {"Out", {"out2"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
-      {{"mul_result", {"mul_out3"}},
-       {"add_result", {"tmp_out3"}},
-       {"Out", {"out3"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
-  auto &grad_fc = *bwd_net->ops_[0];
-
-  const char *all = paddle::operators::NetOp::kAll;
-  EXPECT_EQ(grad_fc.Inputs(all).size(),
-            2UL       /* external input number */
-                + 1UL /* external output number*/
-                + 1UL /* number of gradient of external output*/
-                + 2UL /* internal variable number*/
-            );
-  EXPECT_EQ(grad_fc.Outputs(all).size(),
-            2UL       /* input number of mul*/
-                + 2UL /* input number of rowwise_add*/
-                + 1UL /* input number of sigmod */
-                - 1UL /* out2 is not needed*/);
-  EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
-}
-
-TEST(Backward, simple_single_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("rowwise_add");
-  op->SetInput("X", {"x"});
-  op->SetInput("b", {"b"});
-  op->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b")}));
-
-  EXPECT_EQ(var_to_grad.size(), 3UL);
-  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
-  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
-}
-
-TEST(Backward, default_attribute) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("mul");
-  op->SetInput("X", {"x"});
-  op->SetInput("Y", {"y"});
-  op->SetOutput("Out", {"out"});
-  op->CheckAttrs();
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
-
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  ASSERT_EQ(grad_op->Type(), "mul_grad");
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
-}
-
-TEST(Backward, simple_mult_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op2 = block->AllOps()[5];
-  EXPECT_EQ(grad_op2->Type(), "mul_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  EXPECT_EQ(var_to_grad.size(), 7UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out2"),
-            f::GradVarInfo(f::GradVarName("out2"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-}
-
-TEST(Backward, intermedia_var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"x2"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  f::OpDesc *op4 = block->AppendOp();
-  op4->SetType("mul");
-  op4->SetInput("X", {"out1"});
-  op4->SetInput("Y", {"out3"});
-  op4->SetOutput("Out", {"out4"});
-
-  auto target = f::VarDesc("out4");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"out3"});
-
-  ASSERT_EQ(block->AllOps().size(), 7UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  EXPECT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out4")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-}
-
-TEST(Backward, var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("mult_in_out");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("H", {"h1"});
-  op1->SetOutput("Y", {"y1"});
-  op1->SetOutput("Z", {"z1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mult_in_out");
-  op2->SetInput("X", {"y1"});
-  op2->SetInput("H", {"z1"});
-  op2->SetOutput("Y", {"y2"});
-  op2->SetOutput("Z", {"z2"});
-
-  auto target = f::VarDesc("z2");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"z1"});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op2 = block->AllOps()[3];
-  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
-            std::vector<std::string>({f::GradVarName("z2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
-
-  f::OpDesc *fill_zero_op = block->AllOps()[4];
-  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
-  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
-  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(fill_zero_op->Output("Out"),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[5];
-  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
-  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
-  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
-            std::vector<std::string>({f::GradVarName("h1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
-}
-
-TEST(Backward, shared_var) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out1"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 8UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  ASSERT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *sum_op = block->AllOps()[6];
-  ASSERT_EQ(sum_op->Type(), "sum");
-  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
-  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(sum_op->Input("X"),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
-                                      f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(sum_op->Output("Out"),
-            std::vector<std::string>({f::GradVarName("out1")}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[7];
-  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 6UL);
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-}
-
-TEST(Backward, half_backward) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  auto *op1 = block->AppendOp();
-  op1->SetType("minus");
-  op1->SetInput("X", {"a"});
-  op1->SetInput("Y", {"b"});
-  op1->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"b"});
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-  auto ops = block->AllOps();
-  ASSERT_EQ(3UL, ops.size());
-
-  EXPECT_EQ(var_to_grad.size(), 2UL);
-  EXPECT_EQ(var_to_grad.at("a"),
-            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
-}
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 873969b2a884f6d9e133fe87bf72725c36ce8b98..eef19c4f09c60b9df18f154c85c421f5bff9413f 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -92,7 +92,7 @@ class BlockDesc {
 
   /*
    * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
    * output variable of other ops, we should remain it.
    */
   void RemoveOp(size_t s, size_t e);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 89b5c6847f15b3f2a270fe1e7db9e590549e8982..85b649b2937f6a281b9ee1fe7bae8101169f6102 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
 nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
         dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
@@ -15,7 +16,7 @@ else()
     set(multi_devices_graph_builder_deps)
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
+            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7a1b40c0b60a788b1f0a70e688f8fcbe427ad076..e3f8bbb72f2a1b75b6041d41496cef0efc81874f 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }
 
-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }
 
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 9180903b864d03e59f55f41410b2240fa4199496..e3e7c55d153aec8ce9c25c962821b266eaa84fe4 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
 
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e7a0cb678ebfd8a3fe5f873e995b63b0857e5ba4..e0dd9e6068174a4b0348d503f4082bee6ff68dac 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/details/send_op_handle.h"
 #include "paddle/fluid/framework/scope.h"
 
 #ifdef PADDLE_WITH_CUDA
@@ -54,6 +55,27 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+                                                const platform::Place &p,
+                                                const size_t &i) const {
+  auto *op_handle = result->ops_.back().get();
+  op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(p));
+
+  auto var_names = op->InputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
+    op_handle->AddInput(var);
+  }
+
+  var_names = op->OutputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    CreateOpOutput(result, op_handle, each_var_name, p, i);
+  }
+}
+
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
   auto graph = new SSAGraph();
@@ -76,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       }
     }
 
+    // append send op if program is distributed trainer main program.
+    // always use the first device
+    if (!is_forwarding && op->Type() == "send") {
+      auto &p = places_[0];
+      auto *s = local_scopes_[0];
+      // FIXME(wuyi): send op always copy from GPU 0
+      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
+      // Create inputs for output on original place and no ssa output
+      // is created for send op.
+      CreateOpHandleIOs(&result, op, p, 0);
+      continue;
+    }
+
     for (size_t i = 0; i < places_.size(); ++i) {
       auto &p = places_[i];
       auto *s = local_scopes_[i];
 
       result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
       auto *op_handle = result.ops_.back().get();
-      op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(p));
+      CreateOpHandleIOs(&result, op, p, i);
 
-      auto var_names = op->InputArgumentNames();
-
-      for (auto &each_var_name : var_names) {
-        VarHandle *var =
-            CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
-        op_handle->AddInput(var);
-      }
-      var_names = op->OutputArgumentNames();
-
-      for (auto &each_var_name : var_names) {
-        CreateOpOutput(&result, op_handle, each_var_name, p, i);
-      }
+      auto var_names = op->OutputArgumentNames();
 
       if (is_forwarding) {
         if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index d3c8e582cf2cdf26198822e4bd2602883622df21..de34caab1be85eecb741a5003f026eb982e178ea 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
 namespace paddle {
@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
 
+ private:
+  void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
+                         const size_t &i) const;
+
  private:
   std::string loss_var_name_;
   const std::vector<platform::Place> &places_;
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index d7a541ac4bb83625060db337446d03a1afda3ed0..fbdb54ba8d940c8dedd44a42a85825af5d2ec664 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d181607e86372f4872c38bc35db786ac142ccc65
--- /dev/null
+++ b/paddle/fluid/framework/details/send_op_handle.cc
@@ -0,0 +1,43 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/send_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
+                           const Scope *local_scope,
+                           const platform::Place &place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      local_scope_(local_scope),
+      place_(place) {}
+
+void SendOpHandle::RunImpl() {
+  // Wait input done
+  for (auto *in : inputs_) {
+    auto &p = static_cast<VarHandle *>(in)->place_;
+    if (in->DebugString() == "dummy") {  // HACK
+      continue;
+    }
+    in->generated_op_->Wait(dev_ctxes_[p]);
+  }
+  op_->Run(*local_scope_, place_);
+}
+
+std::string SendOpHandle::Name() const { return "send"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..173f9d726145aeb9e85cc0fb9056eb57bf484098
--- /dev/null
+++ b/paddle/fluid/framework/details/send_op_handle.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct SendOpHandle : public OpHandleBase {
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const platform::Place& place_;
+
+  SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
+               const platform::Place& place);
+
+  std::string Name() const override;
+
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 3b818b1a45b56351e34f9e52ec22b6d02a0c1591..a8833b7388ab907020a260d356f1484ffd227658 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once
 
 #include <memory>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-
 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 62af4c1d79ded5eaa30e4e6d43cc0d7327ae9689..1ce69ab02b09fe7ec17f479bcef97c931e853dc4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };
 
-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };
 
   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
-  }
-
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
   }
 
   return fetch_data;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 79cfc26b461a39811a9a125e5aeac3492d967386..bb5e837b135c35b5aea403496b45aab1ccc288ff 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 8155cb55a468a09320b1196b49fc3e34cea261b1..a56674cbe216e312c4394ef537140122352dc785 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_tensor.h"
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -22,11 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
 
-#include <stdint.h>
-#include <string.h>
-#include <algorithm>
-#include <iterator>
-
 namespace paddle {
 namespace framework {
 
@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-void WriteToRecordIO(recordio::Writer &writer,
+void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
   std::stringstream buffer;
@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer,
   for (auto &each : tensor) {
     SerializeToStream(buffer, each, dev_ctx);
   }
-  writer.Write(buffer.str());
+  writer->Write(buffer.str());
 }
 
 std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) {
-  std::istringstream sin(scanner.Next());
-  uint32_t sz;
-  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
   std::vector<LoDTensor> result;
-  result.resize(sz);
-  for (uint32_t i = 0; i < sz; ++i) {
-    DeserializeFromStream(sin, &result[i], dev_ctx);
+  if (scanner->HasNext()) {
+    std::istringstream sin(scanner->Next());
+    uint32_t sz;
+    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    result.resize(sz);
+    for (uint32_t i = 0; i < sz; ++i) {
+      DeserializeFromStream(sin, &result[i], dev_ctx);
+    }
   }
   return result;
 }
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 4f130d265900483ec7a7c541f2610d17a352913f..1159fee39b0737402c60448dcbe69e7535c9d6e1 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
 void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const platform::DeviceContext& dev_ctx);
 
-extern void WriteToRecordIO(recordio::Writer& writer,
+extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);
 
 extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx);
+    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index e691e29383d4842b80769021e0e494967d38e9bb..97ab98f09b1a902a942d9667bc7716a28b98d54c 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#include "paddle/fluid/recordio/scanner.h"
-#include "paddle/fluid/recordio/writer.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
 #include <vector>
 
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
 
@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) {
       *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   {
     recordio::Writer writer(stream, recordio::Compressor::kSnappy);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
     writer.Flush();
   }
 
@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) {
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(scanner, ctx);
+    auto tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(scanner, ctx);
+    tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a3b4a8c0829ae3324e933309b2eaea35fe571997..f97bd0827428feeb590fcad16c48f3461517a646 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   }
 }
 
-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f393105fe82bfad70246952deada8e296c851ef5..c1486b527d2e06d2b3f7e0f89458bf9a22564586 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 
 #include <string>
+#include <tuple>
 #include <vector>
 
 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
       allow_op_delay));
 
   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
 }
 
@@ -115,14 +113,12 @@ void ParallelExecutor::BCastParamsToGPUs(
 
   for (auto &var : vars) {
     auto *main_var = main_scope->FindVar(var);
-    if (!main_var->IsType<LoDTensor>()) {
+    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
       continue;
     }
 
     auto &main_tensor = main_var->Get<LoDTensor>();
-
     auto &dims = main_tensor.dims();
-
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
@@ -165,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }
 
 void ParallelExecutor::SplitTensorToPlaces(
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index c048c3865f14822be4a0015e385ea1b8e05d0ced..b4f16dba858fb279ec23a8a04257dda6651148cc 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -48,13 +48,13 @@ class ParallelExecutor {
            const std::string& fetched_var_name,
            const std::unordered_map<std::string, LoDTensor>& feed_tensors);
 
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
+
  private:
   void SplitTensorToPlaces(
       const std::unordered_map<std::string, LoDTensor>& feed_tensors);
 
   ParallelExecutorPrivate* member_;
-
-  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index 0e44b34383027ef58a033eb082f4bb2118b5d8a3..8af7d2d510d36e4c24ce3ae8dbc13c24ad5d4a0f 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/prune.h"
 
+#include <gtest/gtest.h>
+#include <string>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/net_op.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include <gtest/gtest.h>
-
 namespace f = paddle::framework;
-namespace ops = paddle::operators;
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 56bf00e5f91700f0cffa917aad8608caaab0a7fe..76126f3dc64d71770d13f9d66bb30f176c112629 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
 
 void FileReader::ReadNext(std::vector<LoDTensor> *out) {
   ReadNextImpl(out);
-  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
+  if (out->empty()) {
+    return;
+  }
   for (size_t i = 0; i < dims_.size(); ++i) {
     auto &actual = out->at(i).dims();
     auto &expect = dims_[i];
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 3573b99becf6d657c680c5fec0bda4bdde5dd7a2..3a413941df964c8d9454fafc6030c377c10f9fb1 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -14,14 +14,13 @@
 
 #pragma once
 
+#include <memory>
+#include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <memory>
-#include <thread>
-#include <vector>
-
 namespace paddle {
 namespace framework {
 
@@ -31,8 +30,6 @@ class ReaderBase {
 
   virtual void ReInit() = 0;
 
-  virtual bool HasNext() const = 0;
-
   virtual ~ReaderBase();
 };
 
@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase {
 
   void ReInit() override { reader_->ReInit(); }
 
-  bool HasNext() const override { return reader_->HasNext(); }
-
  protected:
   ReaderBase* reader_;
 };
@@ -80,8 +75,6 @@ class ReaderHolder {
     reader_->ReInit();
   }
 
-  bool HasNext() const { return reader_->HasNext(); }
-
  private:
   std::unique_ptr<ReaderBase> reader_;
 };
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index f417f62f3f75360f4ae1b7795608ae95200cfeb8..e53bcf2384e54e21c7dd5638f3b7469a35b571bf 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 
 cc_library(paddle_fluid_api
     SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index a5b62ef322bfad0fc956d7d722797bd5add6aea6..a29d457b6fa9d0e8297252c8ff1117013d2055f8 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"
 
 namespace paddle {
 namespace inference {
 
+// Temporarilly add this function for exposing framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
+
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index d07d315b93ef10a464080899b1cb9920abe83be3..756c936b33ad55e2994542b171b945e248ba2e21 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,12 +18,15 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace inference {
 
+void Init(bool init_p2p);
+
 void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 6ed77adb9d891c75e7de358d0d7a0c06c9af96dd..97d9f03f88ad3e851a2dd4256d34e8ca76fdfb01 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,14 +17,15 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${TARGET_NAME}.cc
-        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        DEPS paddle_fluid
         ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
   endforeach()
 endfunction(inference_test)
 
-inference_test(fit_a_line)
+# This unittest is buggy!
+#inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 5ff987ad8b3ba3c9195e87e6c11e70ac98fa0a11..7d6781c2c38822eaabb64eda9c76ff657bbdeeb8 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,7 +100,7 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -199,7 +199,6 @@ else()
     set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
 
-op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -246,9 +245,17 @@ op_library(channel_send_op DEPS concurrency)
 op_library(channel_recv_op DEPS concurrency)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
+
+# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
+# Because the fully connected layer has only one MKLDNN's operator
+if(NOT WITH_MKLDNN)
+    list(REMOVE_ITEM GENERAL_OPS fc_op)
+endif(NOT WITH_MKLDNN)
+
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
+
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 
 add_subdirectory(reader)
@@ -259,7 +266,6 @@ endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index eecb58e11ef57b550c79c040e6933ed6e52e2e87..cb1927bc0f2eb735f0a3184df5f0f8fada2f9dca 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     const auto *bias = ctx.Input<Tensor>("Bias");
 
     auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
 
     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-        functor;
-    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
     auto handle = dev_ctx.cudnn_handle();
 
@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
+
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+
       double this_factor = 1. - momentum;
 
       CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index d65a7b34678cda38d5f8beb9154d61928f517ce0..4a36b03cb63ac3ea61be1bbc56b8dd0adbe7d334 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+
 #include <string>
 #include <vector>
 
@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
     size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     const size_t n = ins.size();
 
-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }
 
     auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
deleted file mode 100644
index 15dce9e3e28fa0200e332534f42752838da4db92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cond_op.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-
-framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  auto& sub_scope = scope.NewScope();
-  sub_scopes->push_back(&sub_scope);
-  return sub_scope;
-}
-
-std::vector<framework::Scope*>& CondOp::GetSubScopes(
-    const framework::Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
-}
-
-LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
-  auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  auto& index_tensors =
-      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
-  index_tensors.push_back(LoDTensor());
-  return index_tensors.back();
-}
-
-std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-    const framework::Scope& scope) const {
-  auto* index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
-}
-
-void CondOp::PrepareDataForSubnet(
-    const framework::Scope& scope,
-    const platform::DeviceContext& dev_ctx) const {
-  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    // Create two sub scopes for true and false branches
-    //   sub_scopes[0] for the true branch
-    //   sub_scopes[1] for the false branch
-    AddSubScope(scope);
-    // Create two tensors for true and false indices:
-    //   index_tensors[0] for the true branch
-    //   index_tensors[1] for the false branch
-    AddIndexTensor(scope);
-  }
-
-  Variable* cond_var = scope.FindVar(Input("Cond"));
-  PADDLE_ENFORCE_NOT_NULL(cond_var,
-                          "Input(Cond) of CondOp should not be null.");
-  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-
-  // get the true/false index at runtime according to cond tensor
-  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  std::vector<std::vector<int>> index_vectors;
-  index_vectors.resize(BRANCH_NUM);
-
-  const int* cond_data = cond->data<int>();
-  for (int i = 0; i < cond->dims()[0]; ++i) {
-    if (cond_data[i])
-      index_vectors[TRUE_BRANCH].push_back(i);
-    else
-      index_vectors[FALSE_BRANCH].push_back(i);
-  }
-
-  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensors[0] and index_tensors[1]
-  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
-    int* index_tensor_data_ptr =
-        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-           dim[0] * sizeof(int));
-  }
-
-  // create input in subscopes according to index_vectors
-  for (auto& input : Inputs("Xs")) {
-    Variable* var_parent = scope.FindVar(input);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = var_child->GetMutable<LoDTensor>();
-
-      // Resize child
-      DDim dim = tensor_parent->dims();
-      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-
-      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
-    }
-  }
-
-  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->Var(var_name);
-      }
-    }
-  }
-}
-
-void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
-                                 const platform::DeviceContext& dev_ctx) const {
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  const std::vector<framework::LoDTensor>& index_tensors =
-      GetIndexTensors(scope);
-
-  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
-  PADDLE_ENFORCE(!Outputs("Outs").empty(),
-                 "Outputs(Outs) of CondOp can't be empty.");
-  for (auto& output : Outputs("Outs")) {
-    const LoDTensor* tensor_t_out =
-        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    const LoDTensor* tensor_f_out =
-        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-
-    auto* var_out = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-
-    DDim true_dim = tensor_t_out->dims();
-    DDim false_dim = tensor_f_out->dims();
-    true_dim[0] = 0;
-    false_dim[0] = 0;
-    PADDLE_ENFORCE_EQ(true_dim, false_dim,
-                      "Outputs not of the same shape except the first dim");
-
-    DDim out_dim = tensor_t_out->dims();
-    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
-    tensor_out->Resize(out_dim);
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-
-  // merge output results:
-  // output_tensor = true_output_tensor + false_output_tensor
-  for (auto& output : Outputs("Outs")) {
-    Variable* var_parent = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
-                           tensor_parent);
-    }
-  }
-}
-
-void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-
-  PrepareDataForSubnet(scope, dev_ctx);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], place);
-  }
-  MergeDataFromSubnet(scope, dev_ctx);
-}
-
-class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
-    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
-
-    AddOutput("SubScopes", "sub scopes for true and false branches");
-    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
-
-    AddComment(R"DOC(
-Sample Dependent Conditional Operator.
-
-Given Cond[i] as a 1/0 vector to indicate true/false:
-Out[i] = subnet_true[i], if Cond[i] == true
-Out[i] = subnet_false[i], if Cond[i] == false
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
-                             paddle::operators::CondOpProtoAndCheckerMaker);
diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
deleted file mode 100644
index d3888923dbdeee122fb3045a839c0ba639b892b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cond_op.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-/*
- * @brief CondOp is a dynamic if-else Operator
- *
- * It has a input tensor named cond indicating which netop each instance will
- * run.
- *
- * if cond == 1, it will run true_net, which is a NetOp.
- *
- * if cond == 0, it will run false_net, which is another NetOp.
- */
-class CondOp : public framework::OperatorBase {
- public:
-  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    sub_net_op_.resize(BRANCH_NUM);
-  }
-
-  CondOp(const CondOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-
-  framework::Scope& AddSubScope(const framework::Scope& scope) const;
-  std::vector<framework::Scope*>& GetSubScopes(
-      const framework::Scope& scope) const;
-
-  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
-  std::vector<framework::LoDTensor>& GetIndexTensors(
-      const framework::Scope& scope) const;
-
-  void PrepareDataForSubnet(const framework::Scope& scope,
-                            const platform::DeviceContext& dev_ctx) const;
-  void MergeDataFromSubnet(const framework::Scope& scope,
-                           const platform::DeviceContext& dev_ctx) const;
-
-  /*
-   * Set True Block
-   */
-  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[TRUE_BRANCH] = std::move(net);
-  }
-
-  /*
-   * Set False Block
-   */
-  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[FALSE_BRANCH] = std::move(net);
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override;
-
- private:
-  const int TRUE_BRANCH = 0;
-  const int FALSE_BRANCH = 1;
-  const int BRANCH_NUM = 2;
-
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index 54e0b1d9ad83c5f01f3f0dfbc2a95c642c0aaadc..bbad74e96d9c6c1be24639b63e472f18a599cfab 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <vector>
 #include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index 70698d99589ae9e2e18ec8b1c1bb3bc8c7476131..9c5c6f5aa03632fe3079074d4b164f871fad634d 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string.h>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 8bbfd1f15925992efdeaaffbbe7b350ffbcee889..45f88ec8697d9f3de2612f28889fefc36f7ddbf9 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 }
 
 void ProcGetResponse(const VarHandle& var_h,
-                     // const sendrecv::VariableMessage& ret_msg) {
                      const ::grpc::ByteBuffer& ret_msg) {
-  framework::Variable* outvar = NULL;
+  framework::Variable* outvar = nullptr;
   DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
 }
 
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index d5fc163bc25409e0607b149b61c6266b38119d9d..0b582a08bc0bfbcfdc8f338a6add8edaa5e80818 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {
     ::grpc::ByteBuffer reply;
 
     std::string var_name = request_->OutVarname();
+    VLOG(3) << "prefetch var " << var_name;
     auto var_desc = program_->Block(0).FindVar(var_name);
     framework::Scope* local_scope = &scope_->NewScope();
     auto* var = local_scope->FindVar(var_name);
diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc
index f8cae6b26acf9d37ca286487065d70ede4c03120..cb5f89583436b059ac4d6509dac9f2e3868561aa 100644
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
   }
-  for (int64_t i = 0; i < rows2->size(); ++i) {
+  for (size_t i = 0; i < rows2->size(); ++i) {
     EXPECT_EQ(rows_data2[i], i);
   }
   EXPECT_EQ(slr2->height(), 1000);
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index f04d8d8fd82ed2336dff9c5b88808dc32de6630a..a33634ab2503f988a8a692682ddb238d4794a3c0 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
@@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$.
  protected:
   std::string comment_;
 
-  void Replace(std::string& src, std::string from, std::string to) {
+  void Replace(std::string* src, std::string from, std::string to) {
     std::size_t len_from = std::strlen(from.c_str());
     std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
+         pos = src->find(from, pos + len_to)) {
+      src->replace(pos, len_from, to);
     }
   }
 
   void SetComment(std::string name, std::string equation) {
-    Replace(comment_, "{name}", name);
-    Replace(comment_, "{equation}", equation);
+    Replace(&comment_, "{name}", name);
+    Replace(&comment_, "{equation}", equation);
   }
 };
 
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 2a91dcbcd418fcd61445b7d744789bdeee11d2f2..2490b83b8c50ce4a68095be10d78a380174c1a3f 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 0886bebc41d8b0f28745e88685f3954f86c823a1..1d5c291495c0f0c0d8da9ff6949888b4cbb6036d 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 048391549dd8df24cc215d04431c306ac4c7e5be..5b387d8d344dfc3475a537827acd9e125fe6693c 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index a6a83fefbc6266fa718dcad78b3a018526f124db..d792c68f784d8ffec0eb303a6ab9b59c9f121fa7 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,7 +13,7 @@
    limitations under the License. */
 
 #pragma once
-
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index eef25f8a06ddb3311f3cfea21b64d8f7d7e58f24..c2a8c7f867a4483a7fda2f4336a64ab109ce86e8 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index 800a1303e1a427e7bd5e6c04354b8a5fbd816712..d5162bcd742c05980c89394b5d011bd078b61211 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
 
     auto x_exps = EigenMatrix<T>::From(*emission_exps);
     x_exps.device(place) =
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 9188f2d989e601b7a97dedaf71f7080829cdb7c3..5d293665f0bcc098126ad3ec6c9bf34ff54c3b6f 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <ostream>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>
 
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 
@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   auto ins = Inputs("X");
   auto fan_in = Attr<int>("Fanin");
-  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = block->Program();
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(dev_place);
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (blkid != prefetch_block->ID()) {
+      block_list.push_back(blkid);
+    }
   }
-  auto prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor.Prepare(*program, block_list);
   // Insert placeholder for block0 which holds current op itself.
-  prepared.insert(prepared.begin(),
-                  std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
+      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->SetScope(&recv_scope);
   rpc_service_->SetDevCtx(&dev_ctx);
   // TODO(qiao) set proper fields for table lookup and update
   rpc_service_->SetExecutor(&executor);
-  rpc_service_->SetPrefetchBlkdId(0);
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
   rpc_service_->SetProgram(program);
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     parallel_blkids.push_back(1);
     double ts = detail::GetTimestamp();
     for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                              &recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
+      if (blkid != prefetch_block->ID()) {
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
+          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                                program, &recv_scope);
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
+        }
+        parallel_blkids.push_back(blkid);
       }
-      parallel_blkids.push_back(blkid);
     }
-    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                          &recv_scope);
+    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                          program, &recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
 
     // Reset the received sparse variables, the sum operator would not
@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
+    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+                                    "prefetch block to run on server side.");
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 0da87afc961e896f04b4f0028bf9b17d5e992548..759b2a462ba5b938991aa86be9b9dc3e59fe3f7e 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <ostream>
+#include <string>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,6 +28,7 @@ namespace paddle {
 namespace operators {
 
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kPrefetchBlock[] = "PrefetchBlock";
 
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
 
diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
index 6a7db31cf36f31064259abeb0348e682be9f917c..41aa00ee8ac10e0776c066fc3c37f97b0dd40cc3 100644
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/logical_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index bf33be310686640fa187a07cf46a157b7f433340..5e59bd1b178ad1803f6f70c5f3f9fd7af495ac3c 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index cb1568398125bbb57da974096da527200c1e0975..553a06c3dcdbb9de43afcace75ebec7c5e819d4a 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
+#include <string>
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index d75537741ef1d13b61ad6e244b2bba1ae5509da5..e062d62c66c25e386c7643e310034bc1481ec43d 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 11f9f223b5d9a8091c51c93cee3f9c23b62e5573..a1ef0eb278dea7205cd8052bbe006b0ae4e3a466 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 76245a1b5a9c8ba9c7ee7d7c03a95e2595a01591..acf094238fff92711edf00b4180266138362add1 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index a881ef82ec3cefa826f5f0856cc4fc13c7d7afc0..82541517e122d5da2674b55561ba72af970a2567 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index dfa7f74d5116b4e3f1508f8bef94c598711e8124..172db548960135fbc1841cf58b73894d4f74d838 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index e53183603fec54ceef68873cfd97b4b985b0d437..c28047e6e915280eed6886f99cd6d55704e3f4ad 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -288,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas Hgemm requires GPU compute capability >= 53");
+
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
@@ -310,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
@@ -332,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 85855928521b8b4cc5e8746b0b5f841cc2587618..1f5255887391218b766aa23842e443c8b2ad080f 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/matmul_op.h"
+#include <algorithm>
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h
index 1cd8fe55dcbd23eae771550a363bf0a07e9bf585..f2e9cfdcdbf93326ae193776a7d5f6a324373603 100644
--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <algorithm>
+#include <functional>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matmul.h"
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index efaae7d5f2d20484d90f79d9e13ec2f5ed6e06c9..4e28d98834d27351be99106d6760eae46baf8938 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -13,6 +13,8 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/maxout_op.h"
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 7de9d94979fdc3f3352c556cc8b655ad4bc7e201..a302b24560e680076d62d02b422c6410467deb1d 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/minus_op.h"
-#include "paddle/fluid/operators/net_op.h"
+
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index da4a6af298f61a20e60ff1b8358f30bb0aca2280..5eb9d9950248bb50bb823f071c7fff0ddcc47234 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/momentum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 90af1e2d602ac039b4d98a69a889ff8b1b85ffc6..5038287527c70d376d8c8a1cc8e4cca0b563126a 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc
deleted file mode 100644
index 0c2da744177b602246719d701257fc1b509ad81e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/net_op.h"
-#include <set>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-const char NetOp::kAll[] = "all";
-
-void NetOp::CompleteAddOp(bool calc) {
-  add_op_done_ = true;
-  if (!calc) return;
-  std::set<std::string> input_set;
-  std::set<std::string> output_set;
-  for (auto& op : ops_) {
-    for (auto& ipt : op->Inputs()) {
-      for (auto& var_name : ipt.second) {
-        // If input variable has been in output set, then it will be
-        // added into intermediate_outputs_. Otherwise, it will be
-        // added into input set.
-        if (Contains(output_set, var_name)) {
-          intermediate_outputs_.insert(var_name);
-        } else {
-          input_set.insert(var_name);
-        }
-      }
-    }
-
-    for (auto& opt : op->Outputs()) {
-      for (auto& var_name : opt.second) {
-        output_set.insert(var_name);
-      }
-    }
-  }
-  auto& inputs = inputs_[kAll];
-  inputs.reserve(input_set.size());
-  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
-  auto& outputs = outputs_[kAll];
-  outputs.reserve(output_set.size());
-  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
-}
-
-std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
-  std::ostringstream os;
-  os << OperatorBase::DebugStringEx(scope) << std::endl;
-  for (auto& op : ops_) {
-    std::istringstream is(op->DebugStringEx(scope));
-    for (std::string line; std::getline(is, line);) {
-      os << "    " << line << std::endl;
-    }
-  }
-  return os.str();
-}
-
-bool NetOp::IsNetOp() const { return true; }
-
-std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> all;
-  for (auto& pair : this->outputs_) {
-    for (auto& var_name : pair.second) {
-      all.push_back(var_name);
-    }
-  }
-  if (has_intermediate) {
-    return all;
-  }
-  std::vector<std::string> ret_val;
-  for (auto& each : all) {
-    if (!Contains(intermediate_outputs_, each)) {
-      ret_val.push_back(each);
-    }
-  }
-  return ret_val;
-}
-
-NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-    : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
-  PADDLE_ENFORCE(
-      add_op_done_,
-      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
-  return std::unique_ptr<OperatorBase>(new NetOp(*this));
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h
deleted file mode 100644
index cbf8820cf4991bc24893f13646364dea0955a128..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <set>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-/**
- * @brief Network is also a type of Operator
- *
- * It will manage the operators it has.
- *
- * Network is the container and controller of a set of operators.
-
- * A network object knows all Operators belonging to this network. Variables,
- * which are inputs and outputs of these operators, are created and managed by a
- * hierarchy of Scope objects.
- *
- * This is the base class of network, all the networks should implement the APIs
- * it defines.
- */
-class NetOp : public framework::OperatorBase {
- public:
-  static const char kAll[];
-  NetOp()
-      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
-                                framework::VariableNameMap{},
-                                framework::AttributeMap{}) {}
-
-  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-        const framework::VariableNameMap& outputs,
-        const framework::AttributeMap& attrs);
-
-  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
-    this->ops_.reserve(o.ops_.size());
-    std::transform(
-        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
-        [](const std::unique_ptr<framework::OperatorBase>& op) {
-          return std::unique_ptr<framework::OperatorBase>(op->Clone());
-        });
-    this->CompleteAddOp();
-  }
-
-  bool SupportGPU() const override {
-    for (auto& op : ops_) {
-      if (!op->SupportGPU()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
-
-  /**
-   * @brief Add an operator by ptr
-   */
-  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot AppendOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(std::move(op));
-  }
-
-  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, std::move(op));
-  }
-
-  void InsertOp(size_t pos, const framework::OperatorBase& op) {
-    InsertOp(pos, op.Clone());
-  }
-
-  void CompleteAddOp(bool calculate = true);
-
-  std::string DebugStringEx(
-      const framework::Scope* scope = nullptr) const override;
-
-  bool IsNetOp() const override;
-  std::vector<std::string> OutputVars(bool has_intermediate) const override;
-
-  std::unique_ptr<framework::OperatorBase> Clone() const override;
-
-  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
-
- private:
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
-  bool add_op_done_{false};
-  std::set<std::string> intermediate_outputs_;
-
-  template <typename T, typename KeyType>
-  static bool Contains(T container, KeyType key) {
-    return container.find(key) != container.end();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
deleted file mode 100644
index 3b5f57548585398c441fd8038ba8b053c27392cf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using DeviceContext = platform::DeviceContext;
-
-static int run_cnt = 0;
-
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++run_cnt;
-  }
-};
-
-template <typename T>
-void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
-                                  const std::vector<T>& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  std::unordered_set<T> expected_set;
-  for (auto& tmp : expected) {
-    expected_set.insert(tmp);
-  }
-  for (auto& act : actual) {
-    ASSERT_NE(expected_set.end(), expected_set.find(act));
-  }
-}
-
-TEST(OpKernel, all) {
-  auto net = std::make_shared<NetOp>();
-  ASSERT_NE(net, nullptr);
-
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, framework::AttributeMap{})));
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, framework::AttributeMap{})));
-
-  net->CompleteAddOp();
-  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
-                               net->Inputs(NetOp::kAll));
-  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
-
-  auto final_outs = net->OutputVars(false);
-
-  ASSERT_EQ(final_outs.size(), 1UL);
-  ASSERT_EQ(final_outs[0], "z");
-}
-
-TEST(NetOp, insert_op) {
-  NetOp net;
-  auto op1 = std::unique_ptr<framework::NOP>(
-      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, framework::AttributeMap{}));
-  net.AppendOp(*op1);
-  net.InsertOp(0, *op1);
-  ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, std::move(op1));
-  ASSERT_EQ(3UL, net.ops_.size());
-}
-
-TEST(NetOp, Clone) {
-  NetOp net;
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.CompleteAddOp(true);
-  auto new_net_op = net.Clone();
-  ASSERT_NE(new_net_op, nullptr);
-  ASSERT_TRUE(new_net_op->IsNetOp());
-  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2UL, new_net->ops_.size());
-  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
-  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
index a36abe3789574cb64f05001e34d534cf352a60b2..c93c096575a30dd9344894ead4b81acc16930e21 100644
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index c88578570c1acdecaa97dd8b12a702778fef2b7e..63eaaedcd5fc3df17902511dc02b25bf43ccd241 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
 
     auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+        mkldnn::memory({src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(input_data)));
     auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
+        mkldnn::memory({dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(output_data)));
 
     auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
                                              *workspace_memory);
@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         pool_bwd_desc, mkldnn_engine, *pool_pd);
 
     auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
+        mkldnn::memory({diff_src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
     auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
+        mkldnn::memory({diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(out_grad_data)));
 
     auto bwd_prim = mkldnn::pooling_backward(
         pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index 2fec50ef25e0d2621a87963acdf142d24970329d..a48127ea6983d3d4ea12ec4925f30af233002ef2 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
index 83e7bd138ae25c6d3e09c3d01178d6887205bf98..b55fa76eae34c3179d40f31ed6a57d3ecbbaaccf 100644
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 09ab7da663b5ef5f099b9f65b0df661ceea0d9e2..f9ae01ab5d2972d2a74b36ae6035985d1d874bb6 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
-                << outs[i] << "back";
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
         rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
                                           outs[i]);
       } else {
@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
               "(RPCClient) The RPC client object which will be"
               "initialized at most once.");
     AddOutput("Out",
-              "(SelectedRows) result "
+              "(LoDTensor) result "
               "to be fetched from parameter server")
         .AsDuplicable();
     AddAttr<std::vector<std::string>>(
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 447b854544b72043ea09c09c134af3a48c305561..8eaa12a4a6cfc09fd4e2c3642bc8825fe2af6d6b 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index 82e54139c8c1f42b1d8f74811a6793ec5c66473e..058b13eeb872aaa77a88da37db64a6d59fbdd1cf 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     bool flip = ctx->Attrs().Get<bool>("flip");
 
     std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
 
     size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
     if (max_sizes.size() > 0) {
diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu
index 76bf2b3b7de7a24c80e927c16199f89c5b7fb794..0ea8909296f8f52d252b0ec258666cf32d69a8bb 100644
--- a/paddle/fluid/operators/prior_box_op.cu
+++ b/paddle/fluid/operators/prior_box_op.cu
@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
 
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
 
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
index 1e4a12aac1c5f1c3b7e2e1bc83170de9ad590fc3..1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35 100644
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/transform.h"
@@ -22,23 +24,23 @@ namespace operators {
 
 inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                bool flip,
-                               std::vector<float>& output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.0f);
+  output_aspect_ratior->clear();
+  output_aspect_ratior->push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
-      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
+      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior.push_back(ar);
+      output_aspect_ratior->push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1.0f / ar);
+        output_aspect_ratior->push_back(1.0f / ar);
       }
     }
   }
@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
 
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
 
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index 767eef56861ea075ec2450b1456e7c5c807ce25d..a1127f11a75e54168ca9682a0189255d37ee8571 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 2925b8a85da1b0d19672124e49c8fd22c8b4e6bf..bf02b9958927580608b95d6b8ecfddc7231a02d4 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase {
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->ReInit();
-      reader->ReadNext(&ins);
-      PADDLE_ENFORCE(
-          !ins.empty(),
-          "Reader can not read the next data even it has been re-initialized.");
-    }
+    PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < ins.size(); ++i) {
       auto* out =
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 6fa0195b9ae103418beb56cc4b0fa9ab59e93108..845528860f91d0b479bb3c4dbbe05e32c68dc16f 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
+reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
 # Export local libraries to parent
 set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index ed868786ab2a80efa42574ed4f579c633ce0becf..33a50b5cebc1f65ccf9a00280f0eeadd00982555 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader {
     StartPrefetcher();
   }
 
-  bool HasNext() const override;
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
   ~DoubleBufferReader() { EndPrefetcher(); }
 
  private:
+  bool HasNext() const;
+
   void StartPrefetcher() {
     channel_ = framework::MakeChannel<Item>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
 
     auto place_str = Attr<std::string>("place");
     platform::Place place;
-    if (place_str == "CPU") {
+    if (place_str == "AUTO") {
+      place = dev_place;
+    } else if (place_str == "CPU") {
       place = platform::CPUPlace();
     } else {
       std::istringstream sin(place_str);
@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
       enum_range.insert(string::Sprintf("CUDA:%d", i));
     }
     enum_range.insert("CPU");
-    AddAttr<std::string>("place", "The double buffer place, default is CPU")
-        .SetDefault("CPU")
+    enum_range.insert("AUTO");
+    AddAttr<std::string>("place", "The double buffer place")
+        .SetDefault("AUTO")
         .InEnum({enum_range});
   }
 };
 
-bool DoubleBufferReader::HasNext() const {
-  while (!channel_->IsClosed() && !channel_->CanReceive()) {
-  }
-  return channel_->CanReceive();
-}
-
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
-  }
-
-  Item batch;
-  channel_->Receive(&batch);
-  *out = batch.payloads_;
-  if (batch.ctx_) {
-    batch.ctx_->Wait();
+  out->clear();
+  if (HasNext()) {
+    Item batch;
+    channel_->Receive(&batch);
+    *out = batch.payloads_;
+    if (batch.ctx_) {
+      batch.ctx_->Wait();
+    }
   }
 }
 
@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() {
   StartPrefetcher();
 }
 
+bool DoubleBufferReader::HasNext() const {
+  while (!channel_->IsClosed() && !channel_->CanReceive()) {
+  }
+  return channel_->CanReceive();
+}
+
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
   std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
   std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
   size_t cached_tensor_id = 0;
 
-  while (reader_->HasNext()) {
+  while (true) {
     Item batch;
     auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
     reader_->ReadNext(&cpu_batch);
+    if (cpu_batch.empty()) {
+      // The underlying reader have no next data.
+      break;
+    }
     if (platform::is_gpu_place(place_)) {
       auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
       auto* gpu_ctx = ctxs_[cached_tensor_id].get();
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index b72ccc77a3e1ec30fd817471d3ffd667974ae684..0573345ba502b6a9af35710840d5acf7634f332f 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader {
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
     reader_->ReadNext(out);
-  }
-
-  bool HasNext() const override {
-    if (reader_->HasNext()) {
-      return true;
-    } else {
+    if (out->empty()) {
       ++pass_count_;
-      if (pass_count_ >= pass_num_) {
-        return false;
-      } else {
+      if (pass_count_ < pass_num_) {
         reader_->ReInit();
-        return true;
+        reader_->ReadNext(out);
       }
     }
   }
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index 95d8674c08b63e872926ff8708d0c734da33684c..d1cb8e47da70cab784858caea7e791151fc104dd 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase {
 
   void ReInit() override { return; }
 
-  bool HasNext() const override { return true; }
-
  private:
   float min_;
   float max_;
@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index adaa0b9e5f1ffcfbf3e9cd8fd060153575f270a6..2ae29725561769ebe6428002c9983246b8eec724 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <mutex>
-#include <thread>
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 #include "paddle/fluid/recordio/scanner.h"
 
@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader {
     LOG(INFO) << "Creating file reader" << filename;
   }
 
-  bool HasNext() const override { return scanner_.HasNext(); }
-
   void ReInit() override { scanner_.Reset(); }
 
  protected:
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     if (ThreadSafe) {
       std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     } else {
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     }
   }
 
@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::string filename = Attr<std::string>("filename");
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index b164ce232d6bea7b4ff0c67ee0a7dd83b14f61a2..13825d65913be95f4f444bd9d5271a036ec8b1e2 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader {
       std::random_device device;
       seed_ = device();
     }
-    ReadIntoBuffers();
+    ReloadBuffer();
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
+    out->clear();
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
-      ReadIntoBuffers();
+      ReloadBuffer();
+      if (buffer_.empty()) {
+        return;
+      }
     }
     *out = buffer_[iteration_pos_++];
   }
 
-  bool HasNext() const override {
-    return iteration_pos_ < buffer_.size() || reader_->HasNext();
-  }
-
  private:
-  void ReadIntoBuffers() {
+  void ReloadBuffer() {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
     iteration_pos_ = 0;
     for (size_t i = 0; i < buffer_size_; ++i) {
-      if (!reader_->HasNext()) {
+      std::vector<framework::LoDTensor> ins;
+      reader_->ReadNext(&ins);
+      if (ins.empty()) {
         break;
       }
-      buffer_.emplace_back();
-      reader_->ReadNext(&buffer_.back());
+      buffer_.emplace_back(ins);
     }
     std::mt19937 g(seed_);
     std::shuffle(buffer_.begin(), buffer_.end(), g);
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbf709d5e734c0f2adf3735dc28043c1340349da
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -0,0 +1,94 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class ThreadedReader : public framework::DecoratedReader {
+ public:
+  ThreadedReader(ReaderBase* reader, bool safe_mode)
+      : DecoratedReader(reader), safe_mode_(safe_mode) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    reader_->ReadNext(out);
+  }
+
+  void ReInit() override {
+    if (safe_mode_) {
+      PADDLE_THROW(
+          "ThreadedReader::ReInit() is disabled when 'safe_mode' is true.");
+    }
+    VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in "
+               "multi-thread environment.";
+    reader_->ReInit();
+  }
+
+ private:
+  bool safe_mode_;
+  std::mutex mutex_;
+};
+
+class CreateThreadedReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    bool safe_mode = Attr<bool>("safe_mode");
+    out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode));
+  }
+};
+
+class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddAttr<bool>("safe_mode",
+                  "When 'safe_mode' is true, 'ReInit()' is disabled to avoid "
+                  "unexpected bugs in multi-thread environment.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+      CreateThreadedReader Operator
+
+      This operator creates a threaded reader. A threaded reader's 
+      'ReadNext()' can be invoked by several threads at the same 
+      time. 
+      When the attribute 'safe_mode' is true, the threaded reader's 
+      'ReInit()' is disabled to avoid unexpected bugs in multi-thread 
+      environment.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
+                                   reader::CreateThreadedReaderOp,
+                                   reader::CreateThreadedReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index eacedeea8835d27b712b287824b9d30b03ebebbf..779dc8a6a0deb7792e0540071e3a2588102fa708 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <thread>  // NOLINT
+
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
@@ -19,38 +21,23 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-class MultipleReader : public framework::ReaderBase {
+class MultiFileReader : public framework::ReaderBase {
  public:
-  class ThreadBufferMap {
-   public:
-    std::vector<framework::LoDTensor>& operator[](
-        const std::thread::id& thread_id) {
-      std::lock_guard<std::mutex> lock(mutex_);
-      return buffer_[thread_id];
-    }
-
-    void Clear() { buffer_.clear(); }
-
-   private:
-    std::mutex mutex_;
-    std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
-        buffer_;
-  };
-
-  MultipleReader(const std::vector<std::string>& file_names,
-                 const std::vector<framework::DDim>& dims, size_t thread_num)
-      : file_names_(file_names), dims_(dims) {
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  const std::vector<framework::DDim>& dims, size_t thread_num,
+                  size_t buffer_size)
+      : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
     prefetchers_.resize(thread_num);
     StartNewScheduler();
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  bool HasNext() const override;
   void ReInit() override;
 
-  ~MultipleReader() { EndScheduler(); }
+  ~MultiFileReader() { EndScheduler(); }
 
  private:
+  bool HasNext();
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase {
   std::vector<framework::DDim> dims_;
   std::thread scheduler_;
   std::vector<std::thread> prefetchers_;
+  size_t buffer_size_;
   framework::Channel<size_t>* waiting_file_idx_;
   framework::Channel<size_t>* available_thread_idx_;
   framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
-  mutable ThreadBufferMap thread_buffer_map_;
 };
 
-void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
+void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  if (HasNext()) {
+    buffer_->Receive(out);
   }
-  auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
-  *out = thread_local_buffer;
-  thread_local_buffer.clear();
-}
-
-bool MultipleReader::HasNext() const {
-  auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
-  return thread_local_buffer.empty() ? buffer_->Receive(&thread_local_buffer)
-                                     : true;
 }
 
-void MultipleReader::ReInit() {
+void MultiFileReader::ReInit() {
   EndScheduler();
-  thread_buffer_map_.Clear();
   StartNewScheduler();
 }
 
-void MultipleReader::StartNewScheduler() {
+bool MultiFileReader::HasNext() {
+  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
+  }
+  return buffer_->CanReceive();
+}
+
+void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
   waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
   available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
   buffer_ =
-      framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num);
+      framework::MakeChannel<std::vector<framework::LoDTensor>>(buffer_size_);
 
   for (size_t i = 0; i < file_names_.size(); ++i) {
     waiting_file_idx_->Send(&i);
@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() {
   scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
 }
 
-void MultipleReader::EndScheduler() {
+void MultiFileReader::EndScheduler() {
   available_thread_idx_->Close();
   buffer_->Close();
   waiting_file_idx_->Close();
@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() {
   delete waiting_file_idx_;
 }
 
-void MultipleReader::ScheduleThreadFunc() {
-  VLOG(5) << "MultipleReader schedule thread starts.";
+void MultiFileReader::ScheduleThreadFunc() {
+  VLOG(5) << "MultiFileReader schedule thread starts.";
   size_t completed_thread_num = 0;
   size_t thread_idx;
   while (available_thread_idx_->Receive(&thread_idx)) {
@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() {
       p.join();
     }
   }
-  VLOG(5) << "MultipleReader schedule thread terminates.";
+  VLOG(5) << "MultiFileReader schedule thread terminates.";
 }
 
-void MultipleReader::PrefetchThreadFunc(std::string file_name,
-                                        size_t thread_idx) {
+void MultiFileReader::PrefetchThreadFunc(std::string file_name,
+                                         size_t thread_idx) {
   VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
   std::unique_ptr<framework::ReaderBase> reader =
       CreateReaderByFileName(file_name, dims_);
-  while (reader->HasNext()) {
+  while (true) {
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
+    if (ins.empty()) {
+      break;
+    }
     try {
       buffer_->Send(&ins);
     } catch (paddle::platform::EnforceNotMet e) {
@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase {
     const auto& file_names = Attr<std::vector<std::string>>("file_names");
     PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
     const size_t thread_num = Attr<int>("thread_num");
+    const size_t buffer_size = Attr<int>("buffer_size");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new MultipleReader(
-        file_names, RestoreShapes(shape_concat, ranks), thread_num));
+    out->Reset(new MultiFileReader(file_names,
+                                   RestoreShapes(shape_concat, ranks),
+                                   thread_num, buffer_size));
   }
 };
 
@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
     AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
     AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
         .GreaterThan(0);
+    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
 
     AddComment(R"DOC(
       OpenFiles Operator
 
-      An OpenFilesOp creates a MultipleReader, which is able to 
+      An OpenFilesOp creates a MultiFileReader, which is able to 
       read data multi-threaded from multiple files.
     )DOC");
   }
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 083c1fae5e2016ada6309aba78bdfa6ad7fef89c..a4dcf704a63ae3bad6567ddb042ea23513bccff7 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index f38c5a3c0c9952b37f7db468ea00470a00b5ff6f..54e07490319cf1da749bd33449a7b51efd6c3d65 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index b16d06df8d0f7f57a5ec2f2be9a2cbb12a8ba55d..7ca7639fdb9b4c0fe5fe059a1cad1a22987d47e4 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/operators/net_op.h"
+
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index 542bc3fde2a3616807eea560be85fb42026d5825..3bf5d57809019d3ae469471c2ee2e7aac70b9faf 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 int selected_port;
 
-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
     auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
     }
   }
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }
 
-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   int64_t height = 10;
   int64_t row_numel = 10;
   m::SetConstant<p::CPUDeviceContext, float> set_one;
   // init x0
   std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
   auto x0 = x0_var->GetMutable<f::SelectedRows>();
   x0->set_rows(rows0);
   x0->set_height(height);
@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
 
   // init x1
   std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
   auto x1 = x1_var->GetMutable<f::SelectedRows>();
   x1->set_rows(rows1);
   x1->set_height(height);
@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
       f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
   set_one(ctx, x1_value, 1.0);
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out = out_var->GetMutable<f::SelectedRows>();
   auto out_value = out->mutable_value();
   out->set_height(height);
@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
   f::Scope scope;
   p::CPUPlace place;
   if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
   } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
   }
 
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   const auto &root_block = program.Block(0);
   auto *optimize_block = program.AppendBlock(root_block);
+  auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensers, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
 
@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"PrefetchBlock", prefetch_block});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   LOG(INFO) << "selected port before run " << selected_port;
@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
   // local net
   f::Scope scope;
   p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
   // create rpc client var
   scope.Var("RPC_CLIENT_VAR");
 
@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
   f::Scope scope;
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
index 2cbd9e2394800dc3b9c5be1163d16bbec435c533..56b3713d6af28d0787e114a672a503e86cbd85fd 100644
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
     auto ins = Inputs("X");
 
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_sent");
+    int sync_send = Attr<int>("sync_send");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 074fa9e00f2ec531f324ff10113d95144687d500..06cb0550ad7d4ad0241a4f439ea9ac16d9714c38 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
-    // and run time.
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and runtime.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
 
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index a54f8a2878c8606e6b487552324d1e7dfa94b9b8..a53cbc8ac5199061dafdc7f4cf560b9e4fc577ab 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
-
     auto ids_dims = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
   }
 };
 
@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
     for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_var)->SetType(input_var->GetType());
     }
   }
 };
@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                   ops::SplitIdsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d36ed398ebce661a62ca92696b0089b5289d5b1c..ba1e903dbb6daaa86b1b664322d100a800fd16b3 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -24,35 +24,63 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class SplitIdsOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     auto place = ctx.GetPlace();
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }
 
-    auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-    const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-    const size_t shard_num = outs.size();
+    const auto *ids_var = ctx.InputVar("Ids");
+    if (ids_var->IsType<framework::LoDTensor>()) {
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();
 
-    std::vector<std::vector<T>> out_ids;
-    out_ids.resize(outs.size());
+      std::vector<std::vector<T>> out_ids;
+      out_ids.resize(outs.size());
 
-    // split id by their shard_num.
-    for (int i = 0; i < ids_dims[0]; ++i) {
-      T id = ids[i];
-      size_t shard_id = static_cast<size_t>(id) % shard_num;
-      out_ids[shard_id].push_back(id);
-    }
+      // split id by their shard_num.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        T id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        out_ids[shard_id].push_back(id);
+      }
+
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
 
-    // create tensor for each shard and send to parameter server
-    for (size_t i = 0; i < out_ids.size(); ++i) {
-      auto* shard_t = outs[i];
-      std::vector<T> ids = out_ids[i];
-      auto* shard_data = shard_t->mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      for (size_t i = 0; i < ids.size(); ++i) {
-        shard_data[i] = ids[i];
+      int64_t row_width = ids_dims[1];
+      for (auto &out : outs) {
+        out->set_height(ids_selected_rows->height());
+        framework::DDim ddim = framework::make_ddim(
+            {static_cast<int64_t>(out->rows().size()), row_width});
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
+        for (size_t i = 0; i < ddim[0]; ++i) {
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index dffac772f11bee2fa3dcdf469a86adc57369b54d..e745509ec8c1f2ec305d7d4aabfdd43d847124b5 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/split_op.h"
-#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 22c1db82e9f5aff6aa9a311cd1093b33fa7e6db9..7a10218e1556698f3e0a1828db5de8851dd1c90b 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& src_stride,
                           const framework::DDim& dst_dim,
                           const framework::DDim& dst_stride, T* dst) {
-  using namespace detail;
-  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+      dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
 
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 9061e137bd1c789d34665729c48c1c2ea9525c8e..108f26fafe7af76eaa613d77ed77748ee43ea234 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sum_op.h"
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }
 
     framework::DDim in_dim({0});
     for (auto& x_dim : x_dims) {
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index bfd26c2f2294f954adc81a1719650c46372098c4..d7f4d383ce0d9e1ff42fc12c96aaf0ceb532e5db 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* src,
-                                              bool& firstStep, bool& is_empty,
-                                              Pair<T>& max, int dim,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
                                               const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, src, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* val,
-                                              int* col, bool& firstStep,
-                                              bool& is_empty, Pair<T>& max,
+                                              int* col, bool* firstStep,
+                                              bool* is_empty, Pair<T>* max,
                                               int dim, const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - *beam) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
 __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
                                             Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int& beam, int& k,
+                                            int64_t** topIds, int* beam, int* k,
                                             const int tid, const int warp) {
   while (true) {
     __syncthreads();
@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
       (*topVal)++;
       (*topIds)++;
     }
-    if (tid == maxid[0]) beam++;
-    if (--k == 0) break;
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
     __syncthreads();
 
     if (tid == maxid[0]) {
-      if (beam < MaxLength) {
-        sh_topk[tid] = topk[beam];
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
       }
     }
     if (maxid[0] / 32 == warp) {
-      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+      if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
     }
   }
 }
@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     topk[k].set(-INFINITY, -1);
   }
   while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
-                                           src + blockIdx.x * lds, firststep,
-                                           is_empty, max, dim, tid);
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
 
     sh_topk[tid] = topk[0];
     BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, beam, k, tid, warp);
+                                         &indices, &beam, &k, tid, warp);
   }
 }
 
@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     KeMatrixTopK<T, 5, 256><<<
         grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
-                              .stream()>>>(output_data, output->dims()[1],
-                                           indices_data, input_data,
-                                           input_width, input_width, int(k));
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
 
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 881d611d4ac26f992036f639097815aff625227b..8758af0804ae08fec6fa64d7387f197f046ce20e 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 USE_CUDA_ATOMIC(Add, float);
 USE_CUDA_ATOMIC(Add, int);
 USE_CUDA_ATOMIC(Add, unsigned int);
-USE_CUDA_ATOMIC(Add, unsigned long long int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT
 
 CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  static_assert(sizeof(int64_t) == sizeof(long long int),
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
-  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
-                       static_cast<unsigned long long int>(val));
+  return CudaAtomicAdd(
+      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));           // NOLINT
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =
-      reinterpret_cast<unsigned long long int*>(address);
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int* address_as_ull =                 // NOLINT
+      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
 
   do {
     assumed = old;
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 29990043206509e4192bfff84832f09ef127d9dd..ca9ab2c7aecff47924f0198802d710b7661f5576 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -14,8 +14,9 @@
 
 #pragma once
 
-#include <thread>
+#include <thread>  // NOLINT
 #include <typeindex>
+#include <vector>
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
     return ncclDouble;
   } else if (type == typeid(int)) {  // NOLINT
     return ncclInt;
+  } else if (type == typeid(int64_t)) {  // NOLINT
+    return ncclInt64;
   } else {
     PADDLE_THROW("Not supported");
   }
@@ -58,7 +61,7 @@ struct NCCLContext {
   ncclComm_t comm_;
 
   explicit NCCLContext(int dev_id)
-      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
 
   cudaStream_t stream() const { return ctx_->stream(); }
 
@@ -66,23 +69,23 @@ struct NCCLContext {
     return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
   }
 
-  static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts,
+  static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
                               const std::vector<platform::Place> &places) {
     std::vector<ncclComm_t> comms;
     std::vector<int> devs;
-    comms.resize(contexts.size());
-    devs.reserve(contexts.size());
+    comms.resize(contexts->size());
+    devs.reserve(contexts->size());
 
     for (auto &p : places) {
       devs.push_back(boost::get<platform::CUDAPlace>(p).device);
     }
 
     PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(contexts.size()), &devs[0]));
+        &comms[0], static_cast<int>(contexts->size()), &devs[0]));
 
     int i = 0;
     for (auto &dev_id : devs) {
-      contexts.at(dev_id).comm_ = comms[i++];
+      contexts->at(dev_id).comm_ = comms[i++];
     }
   }
 };
@@ -91,7 +94,8 @@ struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
 
-  NCCLContextMap(const std::vector<platform::Place> &places) {
+  explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+    PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
       int dev_id = boost::get<CUDAPlace>(p).device;
@@ -102,15 +106,17 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
 
-    std::vector<ncclComm_t> comms;
-    comms.resize(order_.size());
+    if (places.size() > 1) {
+      std::vector<ncclComm_t> comms;
+      comms.resize(order_.size());
 
-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(order_.size()), &order_[0]));
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          &comms[0], static_cast<int>(order_.size()), &order_[0]));
 
-    int i = 0;
-    for (auto &dev_id : order_) {
-      contexts_.at(dev_id).comm_ = comms[i++];
+      int i = 0;
+      for (auto &dev_id : order_) {
+        contexts_.at(dev_id).comm_ = comms[i++];
+      }
     }
   }
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 884289a7fda65f9713392ec459219b4c89271e73..4fef351c2118e43697606c90a616cd870e78cd77 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,13 +2,13 @@ if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
   else()
     cc_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
     if(NOT APPLE AND NOT ANDROID)
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 2fe829036386086075a7f6ad0b9348a9e8c5e85a..93533e5c9d88a9113d4d3eacb01901a8c14b6324 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <tuple>
 
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) {
            })
       .def("append_block", &pd::ProgramDesc::AppendBlock,
            pybind11::return_value_policy::reference)
-      .def("append_backward",
-           [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             pd::ParamGradInfoMap param_grad_map =
-                 AppendBackward(program_desc, target, no_grad_vars);
-             std::unordered_map<
-                 std::string, std::tuple<std::string /* grad_var_name */,
-                                         int /* block_idx */, int /* op_idx */>>
-                 retv;
-             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
-                  ++it) {
-               const auto &grad_info = it->second;
-               retv[it->first] = std::make_tuple(
-                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
-             }
-             return retv;
-           })
       .def("block", &pd::ProgramDesc::MutableBlock,
            pybind11::return_value_policy::reference)
       .def("num_blocks", &pd::ProgramDesc::Size)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 392404045578489014f2283b885c388d5a4586cf..a1e8ff6399f0812773a7bb753c90e4400b1763d9 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/pybind/protobuf.h"
-
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -31,18 +28,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/protobuf.h"
+#include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 
@@ -239,11 +236,6 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference)
 #endif
-      .def("get_net",
-           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<operators::NetOp>();
-           },
-           py::return_value_policy::reference)
       .def("get_reader",
            [](Variable &self) -> framework::ReaderHolder * {
              PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
@@ -252,7 +244,6 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("has_next", &framework::ReaderHolder::HasNext)
       .def("reset", &framework::ReaderHolder::ReInit);
 
   py::class_<Scope>(m, "Scope", "")
@@ -389,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle.
                                    desc.InitializationErrorString());
                     return OpRegistry::CreateOp(desc);
                   })
-      .def("backward",
-           [](const OperatorBase &forwardOp,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             return Backward(forwardOp, no_grad_vars).release();
-           })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CPUPlace &place) { self.Run(scope, place); })
@@ -421,42 +407,6 @@ All parameter, weight, gradient are variables in Paddle.
            [](const OperatorBase &op) { return op.OutputVars(false); })
       .def("support_gpu", &OperatorBase::SupportGPU);
 
-  py::class_<operators::NetOp, OperatorBase>(m, "Net")
-      .def_static("create",
-                  []() -> operators::NetOp * {
-                    auto *retv = new operators::NetOp;
-                    retv->SetType("plain_net");
-                    return retv;
-                  })
-      .def("append_op", [](operators::NetOp &self,
-                           const OperatorBase &op) { self.AppendOp(op); })
-      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-        self->CompleteAddOp();
-      });
-
-  // cond_op
-  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::CondOp * {
-                    proto::OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::CondOp *>(cond_op.release());
-                  })
-      .def("set_truenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_truenet(net.Clone());
-           })
-      .def("set_falsenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_falsenet(net.Clone());
-           });
-
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
       .def("run",
@@ -554,6 +504,7 @@ All parameter, weight, gradient are variables in Paddle.
                                   bcast_vars, main_program, loss_var_name,
                                   scope, local_scopes, allow_op_delay);
            })
+      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       .def("local_scopes",
            [](ParallelExecutor &self) -> std::vector<Scope *> * {
              return &self.GetLocalScopes();
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
index 0644d91425af1a1ac9363b1dec9e317689331fcb..330d104e0a774d905e463566f85bd2e64a080190 100644
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -39,7 +39,7 @@ class RecordIOWriter {
   void CompleteAppendTensor() {
     auto& ctx =
         *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-    framework::WriteToRecordIO(writer_, tensors_, ctx);
+    framework::WriteToRecordIO(&writer_, tensors_, ctx);
     tensors_.clear();
   }
 
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
index e7ebbba452c5c37113f0962e459da65c66b70873..82d9aa601cf450b8f90573d6c582bb12ced7a48a 100644
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -14,13 +14,13 @@
 
 #include "paddle/fluid/recordio/chunk.h"
 
+#include <zlib.h>
 #include <algorithm>
 #include <memory>
 #include <sstream>
 
 #include "paddle/fluid/platform/enforce.h"
-#include "snappy_stream/include/snappystream.hpp"
-#include "zlib/include/zlib.h"
+#include "snappystream.hpp"
 
 namespace paddle {
 namespace recordio {
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
index ed09d58f6a3e2dba50bf4407c0463480575b248e..c4822329a43a79adc81f0b0cf145b22661ac6f50 100644
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/recordio/header.h"
+
+#include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 4885b74e6c6644704cff01dbf49975d6e87ce0c4..be1565ab533037d4bc72b6d2834c48b04638c297 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -231,7 +231,7 @@ function gen_fluid_inference_lib() {
     Deploying fluid inference library ...
     ========================================
 EOF
-        make inference_lib_dist
+        make -j `nproc` inference_lib_dist
     fi
 }
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index c3696e421f53443abef38d4189ea9b75aa2f5051..bb4b6d5fc4d84a5f899916377942861c3736bea0 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -29,6 +29,7 @@ import optimizer
 import backward
 import regularizer
 import average
+import metrics
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index ded6eb085968343fcdc9f6e4b8353c08408df426..6abe8233b07c484494848c566e9898600a7d8f5c 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import warnings
 """
     Class of all kinds of Average.
 
@@ -22,6 +23,8 @@ import numpy as np
     wrappers of Python functions.
 """
 
+__all__ = ["WeightedAverage"]
+
 
 def _is_number_(var):
     return isinstance(var, int) or isinstance(var, float) or (isinstance(
@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var):
 
 class WeightedAverage(object):
     def __init__(self):
+        warnings.warn(
+            "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
+            (self.__class__.__name__), Warning)
         self.reset()
 
     def reset(self):
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 0ec3ebc7e3dba6e4cf89c8a76622761d210276cf..b0522b49f44d8ed0c8c7e3148e24f312fbdd1123 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -13,14 +13,17 @@
 # limitations under the License.
 
 from __future__ import print_function
-import framework
-from framework import Program, default_main_program, default_startup_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
-import distributed_splitter as splitter
+
 import math
+
+import distributed_splitter as splitter
+import framework
+from framework import Program, default_main_program, Variable
 from . import core
-import debuger
+
+LOOKUP_TABLE_TYPE = "lookup_table"
+LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
+RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
 
 
 class VarBlock:
@@ -35,9 +38,9 @@ class VarBlock:
 
 
 class UnionFind(object):
-    """ Union-find data struct.
+    """ Union-find data structure.
 
-    Union-find is a data struct that keeps track of a set of elements partitioned
+    Union-find is a data structure that keeps track of a set of elements partitioned
     into a number of disjoint (non-overlapping) subsets.
 
     Reference:
@@ -185,19 +188,66 @@ class DistributeTranspiler:
         assert (callable(split_method))
         if program is None:
             program = default_main_program()
-        self.program = program
-        self.trainers = trainers
+        self.origin_program = program
+        self.trainer_num = trainers
         self.optimize_ops = optimize_ops
         # TODO(typhoonzero): currently trainer_id is fetched from cluster system
         # like Kubernetes, we should port this to use etcd later when developing
         # fluid distributed training with fault-tolerance.
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
+        self.pserver_endpoints = pserver_endpoints
+
+        # process lookup_table_op
+        # 1. check all lookup_table_op is distributed
+        # 2. check all lookup_table_op share the same table.
+        distributed_lookup_table_ops = []
+        # support only one distributed_lookup_table now
+        self.table_name = None
+        for op in program.global_block().ops:
+            if op.type == LOOKUP_TABLE_TYPE:
+                if op.attrs['is_distributed'] is True:
+                    if self.table_name is None:
+                        self.table_name = op.input("W")[0]
+                    if self.table_name != op.input("W")[0]:
+                        raise RuntimeError("all distributed lookup_table_ops"
+                                           " should have only one table")
+                    distributed_lookup_table_ops.append(op)
+                else:
+                    if self.table_name is not None:
+                        assert op.input("W")[0] != self.table_name
+
+        self.has_distributed_lookup_table = len(
+            distributed_lookup_table_ops) > 0
 
         # step1: For large parameters and gradients, split them into smaller
         # blocks.
         param_list = [pg[0] for pg in params_grads]
         grad_list = [pg[1] for pg in params_grads]
+
+        if self.has_distributed_lookup_table:
+            param_list = [
+                param for param in param_list if param.name != self.table_name
+            ]
+            grad_list = [
+                grad for grad in grad_list
+                if grad.name != framework.grad_var_name(self.table_name)
+            ]
+            self.table_param_grad = [
+                param_grad for param_grad in params_grads
+                if param_grad[0].name == self.table_name
+            ][0]
+            table_grad_var = self.table_param_grad[1]
+            self.table_grad_list = [
+                program.global_block().create_var(
+                    name="%s.trainer_%d.pserver_%d" %
+                    (table_grad_var.name, trainer_id, index),
+                    type=table_grad_var.type,
+                    shape=table_grad_var.shape,
+                    dtype=table_grad_var.dtype)
+                for index in range(len(self.pserver_endpoints))
+            ]
+
         grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
         param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
         # step2: Create new vars for the parameters and gradients blocks and
@@ -229,7 +279,7 @@ class DistributeTranspiler:
             self.param_grad_ep_mapping[ep]["grads"].append(grad)
 
         rpc_client_var = program.global_block().create_var(
-            name="RPC_CLIENT_VAR",
+            name=RPC_CLIENT_VAR_NAME,
             persistable=True,
             type=core.VarDesc.VarType.RAW)
 
@@ -252,12 +302,19 @@ class DistributeTranspiler:
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
+        if self.has_distributed_lookup_table:
+            self._replace_lookup_table_op_with_prefetch(program, rpc_client_var,
+                                                        eplist)
+            self._split_table_grad_and_add_send_vars(program, rpc_client_var,
+                                                     pserver_endpoints)
+
     def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
+        self.origin_program.global_block().delete_ops(self.optimize_ops)
+        self.origin_program.sync_with_cpp()
         # FIXME(typhoonzero): serialize once will fix error occurs when clone.
-        self.program.__str__()
-        return self.program
+        self.origin_program.__str__()
+        return self.origin_program
 
     def get_pserver_program(self, endpoint):
         """
@@ -293,8 +350,8 @@ class DistributeTranspiler:
                     type=v.type,
                     dtype=v.dtype,
                     shape=v.shape)
-            if self.trainers > 1:
-                for trainer_id in xrange(self.trainers):
+            if self.trainer_num > 1:
+                for trainer_id in xrange(self.trainer_num):
                     var = pserver_program.global_block().create_var(
                         name="%s.trainer_%d" % (orig_var_name, trainer_id),
                         persistable=False,
@@ -308,7 +365,7 @@ class DistributeTranspiler:
         # step3
         optimize_block = pserver_program.create_block(0)
         # step 4
-        # Create a union-find data struct from optimize ops,
+        # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
         # into one set.
         ufind = self._create_ufind(self.optimize_ops)
@@ -383,6 +440,23 @@ class DistributeTranspiler:
         #             __append_optimize_op__(glb_op, optimize_block)
         #             break
 
+        # process distributed lookup_table
+        prefetch_block = None
+        if self.has_distributed_lookup_table:
+            pserver_index = self.pserver_endpoints.index(endpoint)
+            self._create_table_optimize_block(pserver_index, pserver_program,
+                                              append_block)
+            prefetch_block = self._create_prefetch_block(
+                pserver_index, pserver_program, optimize_block)
+
+        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
+        # not be executed, so it's safe to use optimize_block to hold the place
+        if self.has_distributed_lookup_table:
+            assert prefetch_block is not None
+        else:
+            assert prefetch_block is None
+            prefetch_block = pserver_program.global_block()
+
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
@@ -391,8 +465,10 @@ class DistributeTranspiler:
             attrs={
                 "OptimizeBlock": optimize_block,
                 "endpoint": endpoint,
-                "Fanin": self.trainers
+                "Fanin": self.trainer_num,
+                "PrefetchBlock": prefetch_block
             })
+
         pserver_program.sync_with_cpp()
         return pserver_program
 
@@ -450,6 +526,197 @@ class DistributeTranspiler:
                     attrs=op.attrs)
         return s_prog
 
+    # transpiler function for dis lookup_table
+    def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
+                                               eplist):
+        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
+        self.prefetch_input_vars = None
+        self.prefetch_output_vars = None
+
+        continue_search_lookup_table_op = True
+        while continue_search_lookup_table_op:
+            continue_search_lookup_table_op = False
+            all_ops = program.global_block().ops
+            for op in all_ops:
+                if op.type == LOOKUP_TABLE_TYPE:
+                    continue_search_lookup_table_op = True
+
+                    op_index = list(all_ops).index(op)
+                    ids_name = op.input("Ids")
+                    out_name = op.output("Out")
+
+                    if self.prefetch_input_vars is None:
+                        ids_var = program.global_block().vars[ids_name[0]]
+                        self.prefetch_input_vars = self.create_splited_vars(
+                            source_var=ids_var,
+                            block=program.global_block(),
+                            tag="_prefetch_in_")
+                    if self.prefetch_output_vars is None:
+                        out_var = program.global_block().vars[out_name[0]]
+                        self.prefetch_output_vars = self.create_splited_vars(
+                            source_var=out_var,
+                            block=program.global_block(),
+                            tag="_prefetch_out_")
+
+                    # insert split_ids_op
+                    program.global_block().insert_op(
+                        index=op_index,
+                        type="split_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ]
+                        },
+                        outputs={"Out": self.prefetch_input_vars})
+
+                    # insert prefetch_op
+                    program.global_block().insert_op(
+                        index=op_index + 1,
+                        type="prefetch",
+                        inputs={'X': self.prefetch_input_vars},
+                        outputs={
+                            "Out": self.prefetch_output_vars,
+                            "RPCClient": rpc_client_var
+                        },
+                        attrs={"epmap": eplist})
+
+                    # insert concat_op
+                    program.global_block().insert_op(
+                        index=op_index + 2,
+                        type="concat",
+                        inputs={'X': self.prefetch_output_vars},
+                        outputs={
+                            "Out": [
+                                program.global_block().vars[varname]
+                                for varname in out_name
+                            ]
+                        },
+                        attrs={"axis": 0})
+
+                    # delete lookup_table_op
+                    program.global_block().delete_ops([op])
+                    program.sync_with_cpp()
+                    # break for loop
+                    break
+
+    def _split_table_grad_and_add_send_vars(self, program, rpc_client_var,
+                                            pserver_endpoints):
+        # 2. add split_ids_op and send_vars_op to send gradient to pservers
+        # there should only be one table_name
+        all_ops = program.global_block().ops
+        table_grad_name = framework.grad_var_name(self.table_name)
+        for op in all_ops:
+            if table_grad_name in op.output_arg_names:
+                op_index = list(all_ops).index(op)
+                # insert split_ids_op
+                program.global_block().insert_op(
+                    index=op_index + 1,
+                    type="split_ids",
+                    inputs={
+                        'Ids': [program.global_block().vars[table_grad_name]]
+                    },
+                    outputs={"Out": self.table_grad_list})
+                program.global_block().insert_op(
+                    index=op_index + 2,
+                    type="send_vars",
+                    inputs={'X': self.table_grad_list},
+                    outputs={"RPCClient": rpc_client_var},
+                    attrs={"sync_send": True,
+                           "epmap": pserver_endpoints})
+                break
+
+    def _create_prefetch_block(self, pserver_index, pserver_program,
+                               optimize_block):
+        # STEP: create prefetch block
+        table_var = pserver_program.global_block().vars[self.table_name]
+        prefetch_block = pserver_program.create_block(optimize_block.idx)
+        trainer_ids = self.prefetch_input_vars[pserver_index]
+        pserver_ids = pserver_program.global_block().create_var(
+            name=trainer_ids.name,
+            type=trainer_ids.type,
+            shape=trainer_ids.shape,
+            dtype=trainer_ids.dtype)
+        trainer_out = self.prefetch_output_vars[pserver_index]
+        pserver_out = pserver_program.global_block().create_var(
+            name=trainer_out.name,
+            type=trainer_out.type,
+            shape=trainer_out.shape,
+            dtype=trainer_out.dtype)
+        prefetch_block.append_op(
+            type=LOOKUP_TABLE_TYPE,
+            inputs={'Ids': pserver_ids,
+                    "W": table_var},
+            outputs={"Out": pserver_out},
+            attrs={
+                "is_sparse": True,  # has no effect on lookup_table op
+                "is_distributed": True,
+                "padding_idx": -1
+            })
+        return prefetch_block
+
+    def _create_table_optimize_block(self, pserver_index, pserver_program,
+                                     append_block):
+        def _clone_var(block, var, persistable=True):
+            assert isinstance(var, Variable)
+            return block.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                persistable=persistable)
+
+        # STEP: create table optimize block
+        # create table param and grad var in pserver program
+        param_var = _clone_var(
+            pserver_program.global_block(),
+            self.origin_program.global_block().vars[self.table_name])
+        grad_var = _clone_var(
+            pserver_program.global_block(),
+            self.origin_program.global_block().vars[framework.grad_var_name(
+                self.table_name)],
+            persistable=False)
+
+        # create grad vars in pserver program
+        table_grad_var = self.table_param_grad[1]
+        table_grad_list = [
+            pserver_program.global_block().create_var(
+                name="%s.trainer_%d.pserver_%d" %
+                (table_grad_var.name, index, pserver_index),
+                type=table_grad_var.type,
+                shape=table_grad_var.shape,
+                dtype=table_grad_var.dtype) for index in range(self.trainer_num)
+        ]
+
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if op.input("Param")[0] == self.table_name
+        ][0]
+        table_opt_block = pserver_program.create_block(append_block.idx)
+        # only support sgd now
+        assert table_opt_op.type == "sgd"
+
+        # append sum op for table_grad_list
+        table_opt_block.append_op(
+            type="sum",
+            inputs={"X": table_grad_list},
+            outputs={"Out": [grad_var]})
+
+        lr_var = pserver_program.global_block().vars[table_opt_op.input(
+            "LearningRate")[0]]
+        inputs = {
+            "Param": [param_var],
+            "Grad": [grad_var],
+            "LearningRate": [lr_var]
+        }
+        outputs = {"ParamOut": [param_var]}
+        table_opt_block.append_op(
+            type=table_opt_op.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=table_opt_op.attrs)
+
     # ====================== private transpiler functions =====================
     def _create_vars_from_blocklist(self,
                                     program,
@@ -511,7 +778,17 @@ class DistributeTranspiler:
             program.global_block().sync_with_cpp()
         return var_mapping
 
-    def _clone_var(self, block, var):
+    def create_splited_vars(self, source_var, block, tag):
+        return [
+            block.create_var(
+                name=str(source_var.name + tag + str(index)),
+                type=source_var.type,
+                shape=source_var.shape,
+                dtype=source_var.dtype)
+            for index in range(len(self.pserver_endpoints))
+        ]
+
+    def _clone_var(self, block, var, persistable=True):
         assert isinstance(var, Variable)
         return block.create_var(
             name=var.name,
@@ -519,12 +796,12 @@ class DistributeTranspiler:
             dtype=var.dtype,
             type=var.type,
             lod_level=var.lod_level,
-            persistable=True)
+            persistable=persistable)
 
     def _append_split_op(self, program, gradblocks):
         # Split variables that need to be split and append respective ops
         add_suffix = False
-        if self.trainers > 1:
+        if self.trainer_num > 1:
             add_suffix = True
         var_mapping = self._create_vars_from_blocklist(
             program, gradblocks, add_trainer_suffix=add_suffix)
@@ -615,9 +892,9 @@ class DistributeTranspiler:
                     return
                 merged_var = \
                     pserver_block.vars[self._orig_varname(grad_block.name)]
-                if self.trainers > 1:
+                if self.trainer_num > 1:
                     vars2merge = []
-                    for i in xrange(self.trainers):
+                    for i in xrange(self.trainer_num):
                         per_trainer_name = "%s.trainer_%d" % \
                         (self._orig_varname(grad_block.name), i)
                         vars2merge.append(pserver_block.vars[per_trainer_name])
@@ -632,7 +909,7 @@ class DistributeTranspiler:
                             type="scale",
                             inputs={"X": merged_var},
                             outputs={"Out": merged_var},
-                            attrs={"scale": 1.0 / float(self.trainers)})
+                            attrs={"scale": 1.0 / float(self.trainer_num)})
                 new_inputs[key] = merged_var
             elif key == "Param":
                 # param is already created on global program
@@ -668,7 +945,7 @@ class DistributeTranspiler:
             new_shape = None
             if key in ["Param", "Grad", "LearningRate"]:
                 continue
-            var = self.program.global_block().vars[opt_op.input(key)[0]]
+            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             # update accumulator variable shape
             param_shape = new_inputs["Param"].shape
             new_shape = self._get_optimizer_input_shape(opt_op.type, key,
@@ -681,8 +958,8 @@ class DistributeTranspiler:
             new_inputs[key] = tmpvar
 
         # change output's ParamOut variable
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
         outputs["ParamOut"] = new_inputs["Param"]
 
         optimize_block.append_op(
@@ -694,8 +971,8 @@ class DistributeTranspiler:
     def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
-        inputs = self._get_input_map_from_op(self.program.global_block().vars,
-                                             opt_op)
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
         for varlist in inputs.itervalues():
             if not isinstance(varlist, list):
                 varlist = [varlist]
@@ -708,8 +985,8 @@ class DistributeTranspiler:
                         dtype=var.dtype,
                         shape=var.shape)
 
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
 
         for varlist in outputs.itervalues():
             if not isinstance(varlist, list):
@@ -782,7 +1059,6 @@ class DistributeTranspiler:
                 if same_or_split_var(n, param) and n != param:
                     return True
             return False
-        return False
 
     def _get_input_map_from_op(self, varmap, op):
         """Returns a dict from op input name to the vars in varmap."""
@@ -820,7 +1096,7 @@ class DistributeTranspiler:
 
         find_ops = []
         # find ops which output is lr var
-        block = self.program.global_block()
+        block = self.origin_program.global_block()
         for op in block.ops:
             if set(op.output_arg_names) & lr_vars:
                 find_ops.append(op)
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 19e5b61b0b32aba3fe1e7805704a3740e3854fc8..13475025b5c2a759779066f9d511ed8a786118d5 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import numpy as np
 
 import layers
@@ -59,6 +60,9 @@ class Evaluator(object):
     """
 
     def __init__(self, name, **kwargs):
+        warnings.warn(
+            "The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
+            % (self.__class__.__name__, self.__class__.__name__), Warning)
         self.states = []
         self.metrics = []
         self.helper = LayerHelper(name, **kwargs)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 793421a22fbf6f3c25ec6a9bf8359f4e71e905de..4b841ef31dcb67ab660475cf6e231fd8a4ae83d6 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1119,24 +1119,6 @@ class Program(object):
     def current_block(self):
         return self.blocks[self.current_block_idx]
 
-    def append_backward(self, target, no_grad_set=None):
-        """
-        return map(param_name -> (grad_name, block_index, op_index))
-        """
-        assert isinstance(target, Variable)
-        if no_grad_set is None:
-            no_grad_set = set()
-        try:
-            param_to_grad_info = self.desc.append_backward(target.desc,
-                                                           no_grad_set)
-        except Exception as e:
-            raise core.EnforceNotMet(
-                str(e) + "\nCurrent protobuf is\n{0}".format(
-                    self.to_string(False)))
-
-        self.sync_with_cpp()
-        return param_to_grad_info
-
     def create_block(self, parent_idx=None):
         new_block_idx = len(self.blocks)
         parent = self.current_block() if parent_idx is None else self.block(
@@ -1201,6 +1183,8 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.do_model_average = kwargs.get('do_model_average', None)
+
     def __str__(self):
         return self.to_string(True)
 
@@ -1221,7 +1205,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 927f1e625a579737b98e60683d8d9ed90d5e7e03..4e132ed26183eaa5e572128e679cdbffd42e5a42 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -18,7 +18,8 @@ import contextlib
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
-    'init_on_cpu'
+    'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
+    'NormalInitializer', 'XavierInitializer'
 ]
 
 _force_init_on_cpu_ = False
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 969398bda4cfd0b2f5e39f45d34a1da9b216901f..e7d6c4e2521bee133c4794ed1db669b02fc2152b 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -21,8 +21,7 @@ from ..executor import global_scope
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'create_shuffle_reader',
-    'create_double_buffer_reader', 'create_multi_pass_reader'
+    'open_files', 'read_file', 'shuffle', 'double_buffer'
 ]
 
 
@@ -237,13 +236,9 @@ def monkey_patch_reader_methods(reader):
         var = scope.find_var(reader.name)
         return var.get_reader()
 
-    def eof():
-        return not __get_reader__().has_next()
-
     def reset():
         return __get_reader__().reset()
 
-    reader.eof = eof
     reader.reset = reset
     reader.stop_gradient = True
     reader.persistable = True
@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
-def open_recordio_file(filename, shapes, lod_levels, dtypes):
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=False):
+    """
+    Open a RecordIO file
+
+    This layer takes a RecordIO file to read from and returns a Reader Variable.
+    Via the Reader Variable, we can get data from the given RecordIO file.
+
+    Args:
+       filename(str): The RecordIO file's name.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable via which we can get RecordIO file data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.open_recordio_file(
+                                          filename='./data.recordio',
+                                          shapes=[(3,224,224), (1)],
+                                          lod_levels=[0, 0],
+                                          dtypes=['float32', 'int64'])
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.read_file(reader)
+    """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
     ranks = []
@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
     startup_var.persistable = True
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                       startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
     return monkey_patch_reader_methods(main_prog_var)
 
 
-def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
+def open_files(filenames,
+               shapes,
+               lod_levels,
+               dtypes,
+               thread_num,
+               buffer_size=None,
+               pass_num=1,
+               for_parallel=False):
+    """
+    Open files
+
+    This layer takes a list of files to read from and returns a Reader Variable. 
+    Via the Reader Variable, we can get data from given files. All files must 
+    have name suffixs to indicate their formats, e.g., '*.recordio'. 
+
+    Args:
+       filenames(list): The list of file names.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       thread_num(int): The maximal concurrent prefetch thread number.
+       buffer_size(int): The size of prefetch buffer.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run 
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable via which we can get file data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                     './data2.recordio'],
+                                             shapes=[(3,224,224), (1)],
+                                             lod_levels=[0, 0],
+                                             dtypes=['float32', 'int64'],
+                                             thread_num=2,
+                                             buffer_size=2)
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    if buffer_size is None:
+        buffer_size = thread_num
+    if isinstance(filenames, basestring):
+        filenames = [filenames]
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
     ranks = []
@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
         shape_concat.extend(shape)
         ranks.append(len(shape))
 
-    var_name = unique_name('multiple_reader')
-
+    multi_file_reader_name = unique_name('multi_file_reader')
     startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=var_name)
+    startup_reader = startup_blk.create_var(name=multi_file_reader_name)
     startup_blk.append_op(
         type='open_files',
-        outputs={'Out': [startup_var]},
+        outputs={'Out': [startup_reader]},
         attrs={
             'shape_concat': shape_concat,
             'lod_levels': lod_levels,
             'ranks': ranks,
             'file_names': filenames,
-            'thread_num': thread_num
+            'thread_num': thread_num,
+            'buffer_size': buffer_size
         })
 
-    startup_var.desc.set_dtypes(dtypes)
-    startup_var.persistable = True
-    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
-                                      startup_var)
-    return monkey_patch_reader_methods(main_prog_var)
+    startup_reader.desc.set_dtypes(dtypes)
+    startup_reader.persistable = True
+    main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
+                                         startup_reader)
+    if pass_num > 1:
+        main_prog_reader = multi_pass(
+            reader=main_prog_reader, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_reader = parallel(reader=main_prog_reader)
+
+    return monkey_patch_reader_methods(main_prog_reader)
 
 
-def __create_decorated_reader__(op_type, reader, attrs):
+def __create_shared_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs):
     return monkey_patch_reader_methods(main_prog_var)
 
 
-def create_shuffle_reader(reader, buffer_size):
-    return __create_decorated_reader__('create_shuffle_reader', reader,
-                                       {'buffer_size': int(buffer_size)})
+def __create_unshared_decorated_reader__(op_type, reader, attrs):
+    new_reader_name = unique_name(op_type)
+    main_blk = default_main_program().current_block()
+    new_reader = main_blk.create_var(name=new_reader_name)
+    main_blk.append_op(
+        type=op_type,
+        inputs={'UnderlyingReader': reader},
+        outputs={'Out': [new_reader]},
+        attrs=attrs)
+    new_reader.persistable = True
+    new_reader.stop_gradient = True
+    return monkey_patch_reader_methods(new_reader)
+
+
+def shuffle(reader, buffer_size):
+    return __create_unshared_decorated_reader__(
+        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
 
 
-def create_double_buffer_reader(reader, place=None):
+def double_buffer(reader, place=None):
     attrs = dict()
     if place is not None:
         attrs['place'] = str(place).upper()
-    return __create_decorated_reader__('create_double_buffer_reader', reader,
-                                       attrs)
+    return __create_unshared_decorated_reader__('create_double_buffer_reader',
+                                                reader, attrs)
+
+
+def multi_pass(reader, pass_num):
+    return __create_shared_decorated_reader__(
+        'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
 
 
-def create_multi_pass_reader(reader, pass_num):
-    return __create_decorated_reader__('create_multi_pass_reader', reader,
-                                       {'pass_num': int(pass_num)})
+def parallel(reader):
+    return __create_shared_decorated_reader__('create_threaded_reader', reader,
+                                              {})
 
 
 def read_file(file_obj):
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index 3d9157ad4ef9381b70b4007c5bdca91f1482b427..f66dccfa2d040ea0a9d29daeaa1d2da640525959 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -15,12 +15,13 @@
 All layers just related to metric.
 """
 
+import warnings
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
 
-__all__ = ['accuracy']
+__all__ = ['accuracy', 'auc']
 
 
 def accuracy(input, label, k=1, correct=None, total=None):
@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None):
             "Total": [total],
         })
     return acc_out
+
+
+def auc(input, label, curve='ROC', num_thresholds=200):
+    warnings.warn(
+        "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
+        but can not aggregate them and get the pass AUC, because pass \
+        auc can not be averaged with weighted from the minibatch auc value. \
+        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
+        which can get every minibatch and every pass auc value.", Warning)
+    helper = LayerHelper("auc", **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    auc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        attrs={"curve": curve,
+               "num_thresholds": num_thresholds},
+        outputs={"AUC": [auc_out], })
+    return auc_out
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d2e7d58524bfb11627b6acb36ef873c41b348f0f..5c2c2dd7abebf8960d68b4c4dfd746a4e27acd03 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -218,6 +218,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              is_distributed=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -268,8 +269,11 @@ def embedding(input,
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
-               'padding_idx': padding_idx})
+        attrs={
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
     return tmp
 
 
@@ -1516,7 +1520,8 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1552,10 @@ def batch_norm(input,
 
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1556,7 +1564,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
@@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     Here are some examples to explain it.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
     shape [6, 8] and leaving x's data unchanged.
 
     2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     specified is [2, 3, -1, 2], the reshape operator will transform x into a
     4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this 
-    dimension is inferred from the total element number of x and remaining 
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
     dimensions.
 
     3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
     """
     Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`. 
+    padded width is specified by :attr:`paddings`.
 
     Specifically, the number of values padded before the contents of :attr:`x`
     in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None):
         x (Variable): The input tensor variable.
         paddings (list): A list of integers. Its elements specify the padded
                          width before and after for each dimension in turn.
-                         The length of :attr:paddings must be 
+                         The length of :attr:paddings must be
                          :math:`rank(x) \\times 2`.
         pad_value (float): The constant value used to pad.
         name(str|None): A name for this layer(optional). If set None, the layer
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a81c1d4244b919a53dfec36fc5a6659c10adae
--- /dev/null
+++ b/python/paddle/fluid/metrics.py
@@ -0,0 +1,378 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fluid Metrics
+
+The metrics are accomplished via Python natively. 
+"""
+import numpy as np
+import copy
+import warnings
+
+__all__ = [
+    'MetricBase',
+    'CompositeMetric',
+    'Accuracy',
+    'ChunkEvaluator',
+    'EditDistance',
+    'DetectionMAP',
+    'Auc',
+]
+
+
+def _is_numpy_(var):
+    return isinstance(var, (np.ndarray, np.generic))
+
+
+def _is_number_(var):
+    return isinstance(var, int) or isinstance(var, float) or (isinstance(
+        var, np.ndarray) and var.shape == (1, ))
+
+
+def _is_number_or_matrix_(var):
+    return _is_number_(var) or isinstance(var, np.ndarray)
+
+
+class MetricBase(object):
+    """
+    Base Class for all evaluators
+
+    Args:
+        name(str): The name of evaluator. such as, "accuracy". Used for generate
+            temporary variable name.
+    Interface:
+        Note(*) : the states is the attributes who not has _ prefix.
+
+        get_config(): print current states and configuration
+        reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray),
+                Please override this method.
+        update(): update states at every minibatch
+        eval(): get metric evaluation in numpy type.
+    """
+
+    def __init__(self, name, **kwargs):
+        self._name = str(name) if name != None else self.__class__.__name__
+        self._kwargs = kwargs if kwargs != None else dict()
+        self.reset()
+
+    def __str__(self):
+        return self._name
+
+    def reset(self):
+        """
+        states is the attributes who not has _ prefix.
+        reset the states of metrics.
+        """
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        for attr, value in states.iteritems():
+            if isinstance(value, int):
+                setattr(self, attr, 0)
+            elif isinstance(value, float):
+                setattr(self, attr, .0)
+            elif isinstance(value, (np.ndarray, np.generic)):
+                setattr(self, attr, np.zeros_like(value))
+            else:
+                setattr(self, attr, None)
+
+    def get_config(self):
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        config = copy.deepcopy(self._kwargs)
+        config.update({"name": self._name, "states": copy.deepcopy(states)})
+        return config
+
+    def update(self):
+        raise NotImplementedError()
+
+    def eval(self):
+        raise NotImplementedError()
+
+
+class CompositeMetric(MetricBase):
+    """
+    Compute multiple metrics in each minibatch.
+    for example, merge F1, accuracy, recall into one Metric.
+    """
+
+    def __init__(self, name=None, **kwargs):
+        super(CompositeMetric, self).__init__(name, kwargs)
+        self._metrics = []
+
+    def add_metric(self, metric):
+        if not isinstance(metric, MetricBase):
+            raise ValueError("SubMetric should be inherit from MetricBase.")
+        self._metrics.append(metric)
+
+    def eval(self):
+        ans = []
+        for m in self._metrics:
+            ans.append(m.eval())
+        return ans
+
+
+class Accuracy(MetricBase):
+    """
+    Accumulate the accuracy from minibatches and compute the average accuracy
+    for every pass.
+
+    Args:
+       name: the metrics name
+
+    Example:
+        minibatch_accuracy = fluid.layers.accuracy(pred, label)
+        accuracy_evaluator = fluid.metrics.Accuracy()
+        for epoch in PASS_NUM:
+            accuracy_evaluator.reset()
+            for data in batches:
+                loss = exe.run(fetch_list=[cost, minibatch_accuracy])
+            accuracy_evaluator.update(value=minibatch_accuracy, weight=batches)
+            accuracy = accuracy_evaluator.eval()
+    """
+
+    def __init__(self, name=None):
+        super(Accuracy, self).__init__(name)
+        self.value = .0
+        self.weight = .0
+
+    def update(self, value, weight):
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value * weight
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError(
+                "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy."
+            )
+        return self.value / self.weight
+
+
+class ChunkEvalutor(MetricBase):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and
+    compute the precision recall and F1-score using the accumulated counter
+    numbers.
+    """
+
+    def __init__(self, name=None):
+        super(ChunkEvalutor, self).__init__(name)
+        self.num_infer_chunks = 0
+        self.num_label_chunks = 0
+        self.num_correct_chunks = 0
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
+        if not _is_number_or_matrix_(num_infer_chunks):
+            raise ValueError(
+                "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_label_chunks):
+            raise ValueError(
+                "The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_correct_chunks):
+            raise ValueError(
+                "The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        self.num_infer_chunks += num_infer_chunks
+        self.num_label_chunks += num_label_chunks
+        self.num_correct_chunks += num_correct_chunks
+
+    def eval(self):
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0
+        return precision, recall, f1_score
+
+
+class EditDistance(MetricBase):
+    """
+    Accumulate edit distance sum and sequence number from mini-batches and
+    compute the average edit_distance and instance error of all batches.
+
+    Args:
+        name: the metrics name
+
+    Example:
+        edit_distance_metrics = fluid.layers.edit_distance(input, label)
+        distance_evaluator = fluid.metrics.EditDistance()
+        for epoch in PASS_NUM:
+            distance_evaluator.reset()
+            for data in batches:
+                loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
+            distance_evaluator.update(*edit_distance_metrics)
+            distance, instance_error = distance_evaluator.eval()
+
+        In the above example:
+        'distance' is the average of the edit distance in a pass.
+        'instance_error' is the instance error rate in a pass.
+
+    """
+
+    def __init__(self, name):
+        super(EditDistance, self).__init__(name)
+        self.total_distance = .0
+        self.seq_num = 0
+        self.instance_error = 0
+
+    def update(self, distances, seq_num):
+        if not _is_numpy_(distances):
+            raise ValueError("The 'distances' must be a numpy ndarray.")
+        if not _is_number_(seq_num):
+            raise ValueError("The 'seq_num' must be a number(int, float).")
+        seq_right_count = np.sum(distances == 0)
+        total_distance = np.sum(distances)
+        self.seq_num += seq_num
+        self.instance_error += seq_num - seq_right_count
+        self.total_distance += total_distance
+
+    def eval():
+        if self.seq_num == 0:
+            raise ValueError(
+                "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
+            )
+        avg_distance = self.total_distance / self.seq_num
+        avg_instance_error = self.instance_error / self.seq_num
+        return avg_distance, avg_instance_error
+
+
+class DetectionMAP(MetricBase):
+    """
+    Calculate the detection mean average precision (mAP).
+
+    TODO (Dang Qingqing): update the following doc.
+    The general steps are as follows:
+    1. calculate the true positive and false positive according to the input
+        of detection and labels.
+    2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+      https://arxiv.org/abs/1512.02325
+    """
+
+    def __init__(self, name=None):
+        super(DetectionMAP, self).__init__(name)
+        # the current map value
+        self.value = .0
+
+    def update(self, value, weight):
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError(
+                "There is no data in DetectionMAP Metrics. "
+                "Please check layers.detection_map output has added to DetectionMAP."
+            )
+        return self.value / self.weight
+
+
+class Auc(MetricBase):
+    """
+    Auc Metrics which adapts to binary classification.
+    Need to note that auc metrics compute the value via Python natively.
+    If you concern the speed, please use the fluid.layers.auc instead.
+
+    The `auc` function creates four local variables, `true_positives`,
+      `true_negatives`, `false_positives` and `false_negatives` that are used to
+      compute the AUC. To discretize the AUC curve, a linearly spaced set of
+      thresholds is used to compute pairs of recall and precision values. The area
+      under the ROC-curve is therefore computed using the height of the recall
+      values by the false positive rate, while the area under the PR-curve is the
+      computed using the height of the precision values by the recall.
+
+    Args:
+        name: metric name
+        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
+          'PR' for the Precision-Recall-curve.
+        num_thresholds: The number of thresholds to use when discretizing the roc
+            curve.
+
+    "NOTE: only implement the ROC curve type via Python now."
+    """
+
+    def __init__(self, name, curve='ROC', num_thresholds=200):
+        super(MetricBase, self).__init__(name, curve, num_thresholds)
+        self._curve = curve
+        self._num_thresholds = num_thresholds
+        self._epsilon = 1e-6
+        self.tp_list = np.ndarray((num_thresholds, ))
+        self.fn_list = np.ndarray((num_thresholds, ))
+        self.tn_list = np.ndarray((num_thresholds, ))
+        self.fp_list = np.ndarray((num_thresholds, ))
+
+    def update(self, labels, predictions, axis=1):
+        if not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray.")
+        if not _is_numpy_(predictions):
+            raise ValueError("The 'predictions' must be a numpy ndarray.")
+
+        kepsilon = 1e-7  # to account for floating point imprecisions
+        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                      for i in range(num_thresholds - 2)]
+        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+        # caculate TP, FN, TN, FP count
+        for idx_thresh, thresh in enumerate(thresholds):
+            tp, fn, tn, fp = 0, 0, 0, 0
+            for i, lbl in enumerate(labels):
+                if lbl:
+                    if predictions[i, 0] >= thresh:
+                        tp += 1
+                    else:
+                        fn += 1
+                else:
+                    if predictions[i, 0] >= thresh:
+                        fp += 1
+                    else:
+                        tn += 1
+            tp_list[idx_thresh] += tp
+            fn_list[idx_thresh] += fn
+            tn_list[idx_thresh] += tn
+            fp_list[idx_thresh] += fp
+
+    def eval(self):
+        epsilon = self._epsilon
+        num_thresholds = self._num_thresholds
+        tpr = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fn_list + epsilon)
+        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
+        rec = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fp_list + epsilon)
+
+        x = fpr[:num_thresholds - 1] - fpr[1:]
+        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
+        auc_value = np.sum(x * y)
+        return auc_value
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6..36503cac6d5391821b977d90e6b77c4df7e3b564 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.
 
     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
 
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
     """
 
     def __init__(self,
-                 params_grads,
                  average_window_rate,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,24 +849,37 @@ class ModelAverage(Optimizer):
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
+
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
+
         for param, grad in self.params_grads:
-            if grad is not None:
-                self._append_average_accumulate_op(param)
+            self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_apply_op(block, param_grad)
+                self._add_average_apply_op(block, param_grad)
 
         self.restore_program = Program()
         block = self.restore_program.global_block()
         with program_guard(main_program=self.restore_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_restore_op(block, param_grad)
+                self._add_average_restore_op(block, param_grad)
 
     def _add_average_apply_op(self, block, param_grad):
         param = block.clone_variable(param_grad[0])
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 24dfa6144ae9584f1678e662716da123352430dd..5ce2aa1fc4d0b275b502af0f97e4a0f83e85de5b 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -100,9 +100,11 @@ class ParallelExecutor(object):
         local_scopes = share_vars_from.executor.local_scopes(
         ) if share_vars_from else []
 
-        persistable_vars = [
+        self.persistable_vars = [
             v.name
-            for v in filter(lambda var: var.persistable, main.list_vars())
+            for v in filter(lambda var: \
+                var.persistable and var.type != core.VarDesc.VarType.RAW,
+                main.list_vars())
         ]
 
         self.executor = core.ParallelExecutor(
@@ -113,7 +115,7 @@ class ParallelExecutor(object):
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(persistable_vars),
+            set(self.persistable_vars),
             main.desc,
             loss_name if loss_name else '',
             scope,
@@ -143,3 +145,6 @@ class ParallelExecutor(object):
         self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
         return [arr[i] for i in range(len(arr))]
+
+    def bcast_params(self):
+        self.executor.bcast_params(set(self.persistable_vars))
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 255cd2104325afa31449cbd3875499a7c5d7f572..1c6970441bccdc1c1221503256c30c83502bd123 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index c0a6df831acbfe2654a5941cf95c91343992ef13..4d8bca4d2430a248ccf421572bdafdffc3a3003a 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -37,7 +37,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 100
 BATCH_SIZE = 10
 
 embedding_name = 'emb'
@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     emb_layers.append(mark_embedding)
 
     hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
     ]
 
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
     for i in range(1, depth):
         mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
         ])
 
         lstm = fluid.layers.dynamic_lstm(
@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
         input_tmp = [mix_hidden, lstm]
 
     feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
     ])
 
     return feature_out
@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.0001,
+            learning_rate=0.01,
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
@@ -233,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
                         print("second per batch: " + str((time.time(
                         ) - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.05:
+                    if float(pass_precision) > 0.01:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
                             fluid.io.save_inference_model(save_dirname, [
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f10ef9b63412ecf74471f4fb94eb91ac72d5f8f9..3bd24c98a22b5db9833a312f481ed74c3d26f0ad 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,6 +1,12 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+# The fully connected test is removed whe the WITH_MKLDNN flag is OFF
+# Because the fully connected layer has only one kernel (MKLDNN)
+if(NOT WITH_MKLDNN)
+    list(REMOVE_ITEM TEST_OPS test_fc_op)
+endif(NOT WITH_MKLDNN)
+
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
 endif(NOT WITH_DISTRIBUTE)
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 10aa63e18a6eeaa44e5b12f7532998dca2bc5e9f..7ecf9a1459ffc9740ae8c12df3902163ee689f59 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -14,23 +14,13 @@
 
 import unittest
 import numpy as np
-from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
 from paddle.fluid.framework import grad_var_name
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
 def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
     x_shape = x.shape
     if len(x_shape) == 2:
@@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
 
 def _reference_training(x, scale, offset, epsilon, data_format):
     x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
 
     if data_format == "NCHW":
         n, c, h, w = x.shape
@@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
-        if len(x_shape) == 2:
-            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
         y = normalized * scale + offset
-        if len(x_shape) == 2:
-            y = np.reshape(y, x_shape)
         return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
 
-def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
     # Use the following formulas to calculate gradients:
     # grad_scale =
     #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
     #
     # grad_offset = sum(output_y)
     #
-    # grad_x =
+    # x_grad =
     #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    x_shape = x.shape
-
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
-
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
-        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
 
-        # raise ValueError("data_format must be NHWC, got %s." % data_format)
-    grad_x = scale * (grad_y - np.mean(
-        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            grad_y * (x - mean), axis=(0, 1, 2)) /
+    x_grad = scale * (y_grad - np.mean(
+        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            y_grad * (x - mean), axis=(0, 1, 2)) /
                       (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                         axis=(0, 1, 2))
-    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
 
     # transfer back to N, C, H, W
     if data_format == "NCHW":
-        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
-        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
 
-    if len(x_shape) == 2:
-        grad_x = np.reshape(grad_x, x_shape)
-    return grad_x, grad_scale, grad_offset
+    return x_grad, grad_scale, grad_offset
 
 
 def create_or_get_tensor(scope, var_name, var, place):
@@ -186,7 +152,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
         __set_tensor__(output, data)
 
 
-class TestBatchNormOpInference(OpTest):
+class TestBatchNormOpInference(unittest.TestCase):
     def setUp(self):
         self.dtype = np.float32
 
@@ -304,231 +270,121 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
                 self.check_with_place(place, data_format, self.dtype, [2, 3])
 
 
-class TestBatchNormOpTraining(OpTest):
+class TestBatchNormOpTraining(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        if not np.allclose(np.array(tensor), np_array, atol=atol):
+            import pdb
+            pdb.set_trace()
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def test_python_testing(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-
-        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
-                                   epsilon, "NHWC")
-
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
-                                    epsilon, "NCHW")
-
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "inference output")
-        print 'python: NHWC, NCHW, inference checking passed'
-
-    def test_python_training(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        momentum = 0.9
-
-        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-
-        # run forward
-        y_out, saved_mean, var_ref = _reference_training(
-            x_val, scale_val, bias_val, epsilon, "NHWC")
-
-        #
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = var_ref * (1. - momentum) + momentum * variance
-        saved_variance = 1. / np.sqrt(var_ref + epsilon)
-
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2, saved_mean2, var_ref2 = _reference_training(
-            x_val2, scale_val, bias_val, epsilon, "NCHW")
-
-        self.__assert_close(saved_mean, saved_mean2, "batch mean")
-        self.__assert_close(var_ref, var_ref2, "batch variance")
-
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch output")
-        print 'python: NHWC, NCHW, forward checking passed'
-
-        # test backward now
-        # NHWC
-        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
-        y_grad = self.y_grad
-        # y_grad = np.ones(x_shape).astype(np.float32)
-        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
-
-        # NCHW
-        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
-        # y_grad2 = np.ones(x_shape2).astype(np.float32)
-        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
-            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
-
-        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
-        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
-
-        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
-        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
-        print 'python: NHWC, NCHW, backward checking passed'
-
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
-
-            if len(shape) == 2:
-                x_shape = shape
-                c = shape[1]
+            if data_layout == "NCHW":
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             else:
-                # n, h, w, c = 2, 3, 4, 2
                 n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-                if data_format == "NHWC":
-                    x_shape = [n, h, w, c]
-                elif data_format == "NCHW":
-                    x_shape = [n, c, h, w]
-                else:
-                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
+            np.random.seed(123)
+            x = np.random.random_sample(shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             mean = np.zeros(scale_shape).astype(np.float32)
             variance = np.ones(scale_shape).astype(np.float32)
 
             # run forward
-            y_out, saved_mean, var_ref = _reference_training(
-                x_val, scale_val, bias_val, epsilon, data_format)
-
-            # update moving mean and variance
+            y, saved_mean, var_ref = _reference_training(x, scale, bias,
+                                                         epsilon, data_layout)
             mean_out = saved_mean * (1. - momentum) + momentum * mean
             variance_out = var_ref * (1. - momentum) + momentum * variance
             saved_variance = 1. / np.sqrt(var_ref + epsilon)
-
-            #  for gradient test
-            # y_grad = np.ones(x_shape).astype(np.float32)
-            y_grad = np.zeros(x_shape).astype(np.float32)
-            if len(y_grad.shape) == 2:
-                y_grad[0, 0] = 1.
-            else:
-                y_grad[0, 0, 0, 0] = 1.
-            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
-                data_format)
-
-            scope = core.Scope()
-
-            # create input
-            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
-                                               place)
-            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
-            variance_tensor = create_or_get_tensor(scope, "variance", variance,
-                                                   place)
-
-            # create output
-            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                     place)
-            saved_variance_tensor = create_or_get_tensor(
-                scope, "saved_variance", None, place)
-            mean_out_tensor = mean_tensor
-            variance_out_tensor = variance_tensor
-
-            batch_norm_op = Operator(
-                "batch_norm",
-                # inputs
-                X="x_val",
-                Scale="scale_val",
-                Bias="bias_val",
-                Mean="mean",
-                Variance="variance",
-                # outputs
-                Y="y_out",
-                MeanOut="mean",
-                VarianceOut="variance",
-                SavedMean="saved_mean",
-                SavedVariance="saved_variance",
-                # attrs
-                is_test=False,
-                data_layout=data_layout,
-                momentum=momentum,
-                epsilon=epsilon)
-
-            batch_norm_op.run(scope, place)
-
-            # check forward result
-            self.__assert_close(y_tensor, y_out, "y_out")
-            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
-            self.__assert_close(saved_variance_tensor, saved_variance,
-                                "saved_variance")
-            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(variance_out_tensor, variance_out,
-                                "variance_out", atol)
-            print "op test forward passed: ", str(place), data_layout
-
             # run backward
-            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
-            set_output_grad(
-                scope,
-                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
-                place,
-                feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, place)
-
-            x_grad_tensor = create_or_get_tensor(scope,
-                                                 grad_var_name("x_val"), None,
-                                                 place)
-            scale_grad_tensor = create_or_get_tensor(scope,
-                                                     grad_var_name("scale_val"),
-                                                     None, place)
-            bias_grad_tensor = create_or_get_tensor(scope,
-                                                    grad_var_name("bias_val"),
-                                                    None, place)
+            y_grad = np.random.random_sample(shape).astype(np.float32)
+            x_grad, scale_grad, bias_grad = _reference_grad(
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
+                'saved_variance'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                        "Mean": block.var('mean'),
+                        "Variance": block.var('variance')
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "MeanOut": block.var('mean'),  # share the same memory
+                        "VarianceOut":
+                        block.var('variance'),  # share the same memory
+                        "SavedMean": block.var('saved_mean'),
+                        "SavedVariance": block.var('saved_variance')
+                    },
+                    attrs={
+                        "momentum": momentum,
+                        "epsilon": epsilon,
+                        "is_test": False,
+                        "data_layout": data_layout
+                    })
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in
+                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
+                    },
+                    fetch_list=[
+                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
+                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+                    ])
+
+            self.__assert_close(y, out[0], "y")
+            self.__assert_close(mean_out, out[1], "mean")
+            self.__assert_close(variance_out, out[2], "variance", 1e-3)
+            self.__assert_close(saved_mean, out[3], "saved_mean")
+            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
+            self.__assert_close(x_grad, out[5], "x_grad")
+            self.__assert_close(scale_grad, out[6], "scale_grad")
+            self.__assert_close(bias_grad, out[7], "bias_grad")
 
-            # check gradient output
-            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
-            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
-            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), data_layout
+            print "op test forward passed: ", str(place), data_layout
 
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
@@ -537,7 +393,6 @@ class TestBatchNormOpTraining(OpTest):
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
                 test_with_place(place, data_format, [2, 3, 4, 5])
-                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cond_op.py b/python/paddle/fluid/tests/unittests/test_cond_op.py
deleted file mode 100644
index 66fbae961a2701e79da5222ae2689108335c4065..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cond_op.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-from paddle.fluid.op import Operator, CondOp
-
-
-class PySimpleCond(object):
-    '''
-    A simple implementation of dynamic if-else based on numpy
-    '''
-
-    def __init__(self):
-        array = [1] * 10
-        for i in range(1, 10, 2):
-            array[i] = 0
-        self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1)).astype("float32")
-
-    def forward(self):
-        self.index_t = np.where(self.cond == 1)
-        self.index_f = np.where(self.cond == 0)
-        y_t = self.x[self.index_t]
-        y_f = self.x[self.index_f]
-        y_t = y_t * 2.
-        y_f = y_f * (-2.)
-        output = np.zeros(shape=(10, 1))
-        output[self.index_t] = y_t
-        output[self.index_f] = y_f
-        return output
-
-
-class PySimpleCondTest(unittest.TestCase):
-    def setUp(self):
-        self.condnn = PySimpleCond()
-
-    def test_forward(self):
-        output = self.condnn.forward()
-
-
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class TestCondOp(unittest.TestCase):
-    '''
-    Test CondOp
-
-    equation:
-        cond = [True, False, True, False, ...]
-        y[index_t] = x[index_t] * 2.
-        y[index_f] = x[index_f] * -2.
-    outputs:
-        y
-    '''
-
-    def setUp(self):
-        self.py_cond = PySimpleCond()
-
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_cond_op()
-        self.create_sub_net()
-        self.condop.run(self.scope, core.CPUPlace())
-        return np.array(self.scope.find_var("Out").get_tensor())
-
-    def create_global_variables(self):
-        x_np_data = self.py_cond.x
-        create_tensor(self.scope, "X", [10, 1], x_np_data)
-        cond_np_data = self.py_cond.cond.astype("int32")
-        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.var("SubScopes")
-        self.scope.var("IndexTensors")
-        self.scope.var("Out")
-
-    def create_cond_op(self):
-        self.condop = CondOp(
-            Cond="cond",
-            Xs=["X"],
-            Outs=["Out"],
-            SubScopes="SubScopes",
-            IndexTensors="IndexTensors")
-
-    def create_sub_net(self):
-        truenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
-        truenet.append_op(scale_op_t)
-        truenet.complete_add_op(True)
-        self.condop.set_truenet(truenet)
-
-        falsenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
-        falsenet.append_op(scale_op_t)
-        falsenet.complete_add_op(True)
-        self.condop.set_falsenet(falsenet)
-
-    def test_forward(self):
-        print 'test cond op forward'
-        pd_output = self.forward()
-        py_output = self.py_cond.forward()
-        print 'pd_output', pd_output
-        print
-        print 'py_output', py_output
-        self.assertEqual(pd_output.shape, py_output.shape)
-        print 'test passed'
-        return 0
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index 8c67e45b7fc997012af5f678f21271ad8b220edc..69365db4d104a1b69916a605534eff83e242289f 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -15,10 +15,8 @@ import unittest
 import numpy as np
 
 from operator import mul
-from op_test import OpTest
 import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
 
 np.random.random(123)
 
@@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     return grad_x, d_scale, d_bias
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
-    return tensor
-
-
-def set_output_grad(scope, outputs, place, feed_dict=None):
-    def __set_tensor__(name, data=None):
-        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
-        out_dtype = out_tensor.dtype()
-        if data is None:
-            if out_dtype == core.VarDesc.VarType.FP64:
-                data = np.ones(out_tensor.shape(), dtype=np.float64)
-            elif out_dtype == core.VarDesc.VarType.FP32:
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-            else:
-                raise ValueError("Not supported data type " + str(out_dtype))
-        grad_tensor.set(data, place)
-
-    for output in outputs:
-        data = None
-        if output in feed_dict:
-            data = feed_dict[output]
-        __set_tensor__(output, data)
-
-
-class TestLayerNormdOp(OpTest):
+class TestLayerNormdOp(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def __assert_grad_close(self,
-                            tensor,
-                            np_array,
-                            name,
-                            place,
-                            max_relative_error=0.02):
-        a = np.array(tensor)
-        b = np_array
-        abs_a = np.abs(a)
-        abs_a[abs_a < 1e-5] = 1
-
-        diff_mat = np.abs(a - b) / abs_a
-        max_diff = np.max(diff_mat)
-
-        def err_msg():
-            offset = np.argmax(diff_mat > max_relative_error)
-            return ("%s Variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, %f, %f") % (
-                        "Gradient Check On %s" % str(place), name, max_diff,
-                        max_relative_error, offset, a.flatten()[offset],
-                        b.flatten()[offset])
-
-        self.assertLessEqual(max_diff, max_relative_error, err_msg())
-
     def check_forward_backward(self, shape, begin_norm_axis):
-        def test_with_place(place, shape, begin_norm_axis=1):
-            # setUp
-            assert begin_norm_axis > 0 and begin_norm_axis < len(
-                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+        def test_with_place(place, shape, begin_norm_axis):
             # attr
             epsilon = 0.00001
             x_shape = shape
             D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
             scale_shape = [D]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            np.random.seed(123)
+            x = np.random.random_sample(x_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             y_grad = np.random.random_sample(x_shape).astype(np.float32)
 
-            # run forward
-            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
-                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
-            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
-
-            # get gradient
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
-            naive_grad = {
-                "X": x_grad_ref,
-                "Scale": scale_grad_ref,
-                "Bias": bias_grad_ref
-            }
-
-            scope = core.Scope()
-
-            # create input
-            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
-            for i_name in input_map:
-                create_or_get_tensor(scope, i_name, input_map[i_name], place)
-
-            # create output
-            output_map = {"Y": None, "Mean": None, "Variance": None}
-            output_tensor = {}
-            for o_name in output_map:
-                output_tensor[o_name] = create_or_get_tensor(
-                    scope, o_name, output_map[o_name], place)
-
-            layer_norm_op = Operator(
-                "layer_norm",
-                # inputs
-                X="X",
-                Scale="Scale",
-                Bias="Bias",
-                # outputs
-                Y="Y",
-                Mean="Mean",
-                Variance="Variance",
-                # attrs
-                epsilon=epsilon,
-                begin_norm_axis=begin_norm_axis)
-
-            layer_norm_op.run(scope, place)
-
-            # check forward result
-            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
-            for o_tensor in output_tensor:
-                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
-                                    o_tensor, atol)
-
-            # run backward
-            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
-            set_output_grad(
-                scope, ["Y", "Mean", "Variance"],
-                place,
-                feed_dict={"Y": y_grad})
-            layer_norm_op_grad.run(scope, place)
-
-            # get output
-            grad_tensor = {}
-            for o_name in naive_grad:
-                grad_tensor[o_name] = x_ = create_or_get_tensor(
-                    scope, grad_var_name(o_name), None, place)
-
-            # check gradient output
-            for o_grad in naive_grad:
-                self.__assert_grad_close(grad_tensor[o_grad],
-                                         naive_grad[o_grad], o_grad + "@GRAD",
-                                         place)
+            # reference forward & backward
+            y, mean, variance = _reference_layer_norm_naive(
+                x, scale, bias, epsilon, begin_norm_axis)
+            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
+                x, y_grad, scale, mean, variance, begin_norm_axis)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                layer_norm_op = block.append_op(
+                    type="layer_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "Mean": block.var('mean'),  # share the same memory
+                        "Variance":
+                        block.var('variance'),  # share the same memory
+                    },
+                    attrs={
+                        "epsilon": epsilon,
+                        "begin_norm_axis": begin_norm_axis
+                    })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    layer_norm_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
+                              },
+                              fetch_list=[
+                                  'y', 'mean', 'variance', 'x@GRAD',
+                                  'scale@GRAD', 'bias@GRAD'
+                              ])
+                self.__assert_close(y, out[0], "y")
+                self.__assert_close(mean, out[1], "mean")
+                self.__assert_close(variance, out[2], "variance", 1e-3)
+                self.__assert_close(x_grad, out[3], "x_grad")
+                self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
+                self.__assert_close(bias_grad, out[5], "bias_grad")
 
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -237,15 +167,6 @@ class TestLayerNormdOp(OpTest):
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
-    def test_check_forward_backward_with_scale(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward_with_bias(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward(self):
-        pass  # TODO(zcd)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 2179826d81f715d6d280aea28a76f919330dd644..f88a6f1ce6e953c54da29f9e96199169b2cecd8b 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -32,7 +32,6 @@ class TestBook(unittest.TestCase):
             cost = layers.square_error_cost(input=y_predict, label=y)
             avg_cost = layers.mean(cost)
             self.assertIsNotNone(avg_cost)
-            program.append_backward(avg_cost)
 
         print(str(program))
 
@@ -94,8 +93,6 @@ class TestBook(unittest.TestCase):
             cost = layers.cross_entropy(input=predict, label=label)
             avg_cost = layers.mean(cost)
 
-            program.append_backward(avg_cost)
-
         print(str(program))
 
     def test_word_embedding(self):
diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
similarity index 91%
rename from python/paddle/fluid/tests/unittests/test_multiple_reader.py
rename to python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index a60a5d6c4af2b6b3652d0fe2089018b9403eee25..5dc41e54d6158787eb966333c894e378b5c706d0 100644
--- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -61,8 +61,12 @@ class TestMultipleReader(unittest.TestCase):
             exe.run(fluid.default_startup_program())
 
             batch_count = 0
-            while not data_files.eof():
-                img_val, = exe.run(fetch_list=[img])
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
             data_files.reset()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 0b7a29075939a548320185947b5afa7261029d49..1471843ded7a42432a84a9fad76bb97dcf7fb9c2 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -44,7 +44,7 @@ class TestMultipleReader(unittest.TestCase):
                 shapes=[(-1, 784), (-1, 1)],
                 lod_levels=[0, 0],
                 dtypes=['float32', 'int64'])
-            data_file = fluid.layers.create_multi_pass_reader(
+            data_file = fluid.layers.io.multi_pass(
                 reader=data_file, pass_num=self.pass_num)
             img, label = fluid.layers.read_file(data_file)
 
@@ -57,8 +57,12 @@ class TestMultipleReader(unittest.TestCase):
             exe.run(fluid.default_startup_program())
 
             batch_count = 0
-            while not data_file.eof():
-                img_val, = exe.run(fetch_list=[img])
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
             data_file.reset()
diff --git a/python/paddle/fluid/tests/unittests/test_net.py b/python/paddle/fluid/tests/unittests/test_net.py
deleted file mode 100644
index ae1699d647d7c0adab36200fb07bde12085053c1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_net.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import unittest
-
-
-def fc(X, W, Y):
-    ret_v = core.Net.create()
-
-    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
-    ret_v.complete_add_op(True)
-    return ret_v
-
-
-class TestNet(unittest.TestCase):
-    def test_net_all(self):
-        net = core.Net.create()
-        op1 = Operator("sum", X=["X", "Y"], Out="Out")
-        net.append_op(op1)
-
-        net2 = core.Net.create()
-        net2.append_op(fc(X="X", W="w", Y="fc.out"))
-        net2.complete_add_op(True)
-        net.append_op(net2)
-        net.complete_add_op(True)
-
-        expected = '''
-Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
-    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
-'''
-        self.assertEqual(expected, "\n" + str(net))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 8401716db88ef3dda68644a052d78b4476c9fdc7..95845ea4de54ad43754ec5811d28ed52a8a3ae86 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -26,11 +26,14 @@ def simple_fc_net(use_feed):
         img = fluid.layers.data(name='image', shape=[784], dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     else:
-        reader = fluid.layers.open_recordio_file(
-            filename='./mnist.recordio',
+        reader = fluid.layers.open_files(
+            filenames=['./mnist.recordio'],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
     hidden = img
     for _ in xrange(4):
@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed):
         img = fluid.layers.data(name='image', shape=[784], dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     else:
-        reader = fluid.layers.open_recordio_file(
-            filename='./mnist.recordio',
+        reader = fluid.layers.open_files(
+            filenames=['mnist.recordio'],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
 
     hidden = img
@@ -467,7 +473,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
             loss = simple_fc_net(True)
             test_program = main.clone(for_test=True)
 
-            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
             opt.minimize(loss)
 
             batch_size = 32
@@ -494,4 +500,8 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
 
                 train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
                 train_loss = numpy.array(train_loss)
-                self.assertTrue(numpy.allclose(train_loss, test_loss))
+                self.assertTrue(
+                    numpy.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index 87a2195f0d5c7fd355ea01a3c8f60908b33d4b9d..c51a48239330621d8e008415f81361616467cabf 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -87,57 +87,6 @@ class TestProgram(unittest.TestCase):
         print(prog)
         print(prog_restored)
 
-    def test_append_backward(self):
-        prog = Program()
-        block = prog.global_block()
-
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
-
-        add_y = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
-        add_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
-        add_op = block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": add_y},
-            outputs={"Out": add_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
-
-        self.assertEqual(mul_op.idx, 0)
-        self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(mean_out, set())
-
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
-                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0],
-                             grad_var_name(var_name))
-            self.assertEqual(param_to_grad[var_name][1], 0)
-
-        expect_ops = [
-            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
-            "elementwise_add_grad", "mul_grad"
-        ]
-        actual_ops = []
-        for op in block.ops:
-            actual_ops.append(op.type)
-        self.assertEqual(actual_ops, expect_ops)
-
     def test_program_clone_with_parameter(self):
         main_program = Program()
         startup_program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 24a0074d9b9621d902d12eb8cb29d9b65be22ed3..7c8e7f634fdd3ee3f056a95df774402a7c29e906 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -65,8 +65,13 @@ class TestRecordIO(unittest.TestCase):
 
             # train a pass
             batch_id = 0
-            while not data_file.eof():
-                tmp, = exe.run(fetch_list=[avg_loss])
+            while True:
+                try:
+                    tmp, = exe.run(fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
+
                 avg_loss_np.append(tmp)
                 batch_id += 1
             data_file.reset()
@@ -74,8 +79,8 @@ class TestRecordIO(unittest.TestCase):
             self.assertLess(avg_loss_np[-1], avg_loss_np[0])
 
     def test_shuffle_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200))
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
 
     def test_double_buffer_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader,
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
                                                                                                   place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))