diff --git a/.travis.yml b/.travis.yml
index 8c772030925dcad3909f142b08e4d8057a3f89b7..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -27,15 +27,6 @@ script:
     # 43min timeout
     paddle/scripts/paddle_docker_build.sh ${JOB}
     if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
-  - |
-    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
-    # For document only
-    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
-    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
-    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
-    export DOCS_DIR=`pwd`
-    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
   email:
     on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
index 8c4a113fc276783c945867ceae9612339b7f0bbc..41b7193677a0208ba2fa82b72862292572dcb6ef 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -46,6 +46,7 @@
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
+| velconia | Qi-Yang Min |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23bb27e77b9eab0c322a71a8ff570d12d1050377..ed3120f399b6444c801c6fffa435b9835fec90e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,14 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
+option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
+option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
+
+# PY_VERSION
+if(NOT PY_VERSION)
+  set(PY_VERSION 2.7)
+endif()
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -103,6 +110,11 @@ if(ANDROID OR IOS)
     add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 
+if (APPLE OR WIN32)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL for building on mac and windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
@@ -125,6 +137,12 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 
+if(WITH_MKL)
+  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
+  if (MKL_SPLIT_GEMM)
+    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
+  endif()
+endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
     if (WITH_MKL AND AVX2_FOUND)
@@ -141,6 +159,7 @@ endif()
 ########################################################################################
 
 include(external/mklml)     # download mklml package
+include(external/libxsmm)   # download, build, install libxsmm
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@@ -156,6 +175,7 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
+include(external/cub)
 
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
@@ -182,6 +202,14 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
+if(WITH_GPU)
+    include(cuda)
+    include(tensorrt)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
+
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
@@ -210,14 +238,6 @@ set(EXTERNAL_LIBS
     ${PYTHON_LIBRARIES}
 )
 
-if(WITH_GPU)
-    include(cuda)
-    include(tensorrt)
-    include(external/anakin)
-else()
-  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
-endif()
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
@@ -227,6 +247,10 @@ if(WITH_MKLML)
     list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
 
+if(WITH_LIBXSMM)
+    list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
+endif()
+
 if(WITH_MKLDNN)
     list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
@@ -266,7 +290,3 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
-
-if (WITH_CONTRIB)
-    add_subdirectory(paddle/contrib)
-endif()
diff --git a/Dockerfile b/Dockerfile
index 48c750358cfcb227667c429f19befcaa2f51ebbd..402adee2ea2822250ebc8f6229fd6a44545d58e5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
     pip install opencv-python
 
 #For docstring checker
-RUN pip install pylint pytest astroid isort
+RUN pip install pylint pytest astroid isort LinkChecker
 
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
diff --git a/README.md b/README.md
index eb99ed21d02650ef16cc7da91836909c02895be9..a67cb8ad439f462c361cb6bac2449c3a4b042126 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,21 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
-### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
+### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Install Latest Stable Release:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==0.14.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==0.14.0.post85
+
+# For installation on other platform, refer to http://paddlepaddle.org/
+```
 
 ## Features
 
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 94ea7bd6aca7c9595037a2dacc5e36d4c77827e7..6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args):
         trainer_id,
         pservers=pserver_endpoints,
         trainers=trainers,
-        sync_mode=not args.async_mode,
-        slice_var_up=not args.no_split_var)
+        sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
         pserver_startup_program = t.get_startup_program(current_endpoint,
@@ -210,7 +209,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
-            v = startup_prog.global_block().clone_variable(var)
+            v = startup_prog.global_block()._clone_variable(var)
             var.persistable = True
             v.persistable = True
 
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function train() {
diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh
index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755
--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh
index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755
--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh
index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755
--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 set -e
 
 function test() {
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 2c84061ff572de4687b4d496f8ded6deee8d1011..9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
     ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index d205e3958234cabfbfeba8c3d725fe618ce48ace..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -7,9 +7,20 @@ set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
 set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 
-set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+set(ANAKIN_COMPILE_EXTRA_FLAGS 
+    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
+    -Wno-error=unused-variable -Wno-unused-variable 
+    -Wno-error=format-extra-args -Wno-format-extra-args
+    -Wno-error=comment -Wno-comment 
+    -Wno-error=format -Wno-format 
+    -Wno-error=switch -Wno-switch
+    -Wno-error=return-type -Wno-return-type 
+    -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
+    -Wno-sign-compare
+    -Wno-reorder 
+    -Wno-error=cpp)
 
-set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
 
 # A helper function used in Anakin, currently, to use it, one need to recursively include
 # nearly all the header files.
@@ -31,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()
 
 if (WITH_ANAKIN)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c94849cf4b96746e6c507db2a6310c2f305dacf5
--- /dev/null
+++ b/cmake/external/cub.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_cub
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+  GIT_TAG        "v1.8.0"
+  PREFIX         ${CUB_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(cub STATIC ${dummyfile})
+else()
+  add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+LIST(APPEND externl_project_dependencies cub)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 85f40585da29bab9a107f5546e64870975f4c2d3..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -50,6 +50,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE 1
+    PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
     # NOTE(yuyang18):
     # Disable -Werror, otherwise the compile will fail in MacOS.
     # It seems that we cannot configure that by make command.
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0
--- /dev/null
+++ b/cmake/external/libxsmm.cmake
@@ -0,0 +1,57 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
+
+IF(NOT WITH_LIBXSMM)
+    return()
+ENDIF()
+
+IF(WIN32 OR APPLE OR ANDROID OR IOS)
+    MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
+    SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
+    return()
+ENDIF()
+
+INCLUDE (ExternalProject)
+
+SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
+SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
+SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
+SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
+SET(LIBXSMM_LIBS        "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
+                        "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+ExternalProject_Add(
+    extern_libxsmm
+    GIT_REPOSITORY  "https://github.com/hfp/libxsmm.git"
+    GIT_TAG         "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
+    PREFIX          ${LIBXSMM_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
+    INSTALL_COMMAND ""
+)
+ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
+include_directories(${LIBXSMM_INCLUDE_DIR})
+ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
+ADD_DEPENDENCIES(libxsmm extern_libxsmm)
+LIST(APPEND external_project_dependencies libxsmm)
+
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index ce6a88b51dc98ac46dd3935f12658d60d364ba8c..56024edf5be092f81ed893633a8e7cafc8c8d429 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -121,6 +121,11 @@ ELSE()
   TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
 
+IF(WITH_LIBXSMM)
+  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
+  ADD_DEPENDENCIES(cblas extern_libxsmm)
+ENDIF()
+
 IF(NOT ${CBLAS_FOUND})
     ADD_DEPENDENCIES(cblas extern_openblas)
     LIST(APPEND external_project_dependencies cblas)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,9 @@ ENDIF()
 
 INCLUDE(python_module)
 
-FIND_PACKAGE(PythonInterp 2.7)
-FIND_PACKAGE(PythonLibs 2.7)
+FIND_PACKAGE(PythonInterp ${PY_VERSION})
+FIND_PACKAGE(PythonLibs ${PY_VERSION})
+
 # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
 ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index eafb11b6f21e226fc68556a78d675dea94080140..82c958073cba92f00a341121e36ba45531b22aec 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -263,8 +263,11 @@ function(cc_test TARGET_NAME)
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -328,8 +331,11 @@ function(nv_test TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             FLAGS_cpu_deterministic=true
+             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c6979713231f631f8757e4139d6f685d4554b54e..aeb081e76e5bc5b9d3d81ce625195c800174ab6c 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -138,29 +138,21 @@ copy(memory_lib
 
 set(inference_deps paddle_fluid_shared paddle_fluid)
 
-if(WITH_CONTRIB)
-    message(STATUS "installing contrib")
-    set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-    if (WITH_ANAKIN AND WITH_GPU)
-        copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
-            SRCS
-            ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
-            ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
-            DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
-        list(APPEND inference_deps contrib_anakin_inference_lib)
-   endif()
-
-  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
-        DSTS ${contrib_dst_dir} ${contrib_dst_dir})
-  list(APPEND inference_deps contrib_inference_lib)
+set(module "inference/api")
+if (WITH_ANAKIN AND WITH_GPU)
+    copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+        SRCS
+        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
+        ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+     list(APPEND inference_deps anakin_inference_lib)
 endif()
 
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
diff --git a/cmake/version.cmake b/cmake/version.cmake
index cde650128a068faf32f4abfff5cdfdeb656d8577..ac10bdf067be549fe90112aef73fd6e1fbe0ac48 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -1,23 +1,46 @@
 # Get the latest git tag.
 set(PADDLE_VERSION $ENV{PADDLE_VERSION})
 set(tmp_version "HEAD")
+set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
 while ("${PADDLE_VERSION}" STREQUAL "")
+  # Check current branch name
   execute_process(
-    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_TAG_NAME
-    RESULT_VARIABLE GIT_RESULT
+    OUTPUT_VARIABLE GIT_BRANCH_NAME
+    RESULT_VARIABLE GIT_BRANCH_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_RESULT})
-    # Check the tag is a correct version
-    if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
-      string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-    else()  # otherwise, get the previous git tag name.
-      set(tmp_version "${GIT_TAG_NAME}~1")
+  if (NOT ${GIT_BRANCH_RESULT})
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+      OUTPUT_VARIABLE GIT_TAG_NAME
+      RESULT_VARIABLE GIT_RESULT
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (NOT ${GIT_RESULT})
+      # Check if current branch is release branch
+      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
+        # Check the tag is a correct version
+        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+          # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
+          set(PADDLE_VERSION "0.0.0")
+        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+        else()  # otherwise, get the previous git tag name.
+          set(tmp_version "${GIT_TAG_NAME}~1")
+        endif()
+      else()
+        # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
+        set(PADDLE_VERSION "0.0.0")
+      endif()
+    else()
+      set(PADDLE_VERSION "0.0.0")
+      message(WARNING "Cannot add paddle version from git tag")
     endif()
   else()
     set(PADDLE_VERSION "0.0.0")
-    message(WARNING "Cannot add paddle version from git tag")
+    message(WARNING "Cannot add paddle version for wrong git branch result")
   endif()
 endwhile()
 
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index d443c49657b92583e527035f49e74462cf41487d..ecbd8191ccf5aa6046e7875fe8afa2ed0105e4a0 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1768,3 +1768,11 @@ reverse
 ..  autofunction:: paddle.fluid.layers.reverse
     :noindex:
 
+.. _api_fluid_layers_rank_loss:
+
+rank_loss
+-------
+
+..  autofunction:: paddle.fluid.layers.rank_loss
+    :noindex:
+
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index 97f890c88e778a59ea475e984ccbc28cf026fc5b..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,6 +1,6 @@
 # Design Doc: Distributed Lookup Table Operator
 
-A lookup table operator in PaddlePaddle where the table could be out
+A distribute lookup table operator in PaddlePaddle where the table could be out
 of the memory of a computer.
 
 ## Background
@@ -24,14 +24,14 @@ memory, so we'd need a distributed storage service, which supports the
 lookup of rows.
 
 The following figure illustrates the multiplication of x with two
-non-zero elements, or say, two symbols, and a lookup table W:
+non-zero elements, or say two symbols, and a lookup table W:
 
 ![lookup table](./src/lookup_table.png)
 
 ### The Backward Algorithm
 
 The backward algorithm computes W'(x) using W(x).  W'(x) has the same
-scale of size as W(x) and is much smaller than W.
+the scale of size as W(x) and is much smaller than W.
 
 To optimize W given W', we can do simple SGD update:
 
@@ -44,111 +44,46 @@ $$W = f(W, W')$$
 The following figure illustrates the backward pass of the lookup
 operator: ![lookup table training](./src/lookup_table_training.png)
 
-## Distributed Storage Service
-
-The forward algorithm requires a distributed storage service for W.
-The backward algorithm prefers that the storage system can apply the
-optimization algorithm on W.  The following two sections describe two
-solutions -- the former doesn't require that the storage service can
-do optimization, the latter does.
-
-### Storage Service Doesn't Optimize
-
-In this design, we use highly-optimized distributed storage, e.g.,
-memcached, as the storage service, and we run the optimization
-algorithm on parameter servers of PaddlePaddle.  The following figure
-illustrates the training process.
-
-<!--
-Note: please update the following URL when update this digraph.
-<img src='https://g.gravizo.com/svg?
-digraph G {
-  rankdir="LR";
-  subgraph cluster1 {
-  P1 [label="pserver 1"];
-  P2 [label="pserver 2"];
-  T1 [label="trainer 1"];
-  T2 [label="trainer 2"];
-  T3 [label="trainer 3"];
-  }
-  KV [label="memcached"];
-  T1 -> P1;
-  T1 -> P2;
-  T2 -> P1;
-  T2 -> P2;
-  T3 -> P1;
-  T3 -> P2;
-  P1 -> KV [color=gray, weight=0.1];
-  KV -> P1 [color=gray, weight=0.1];
-  P2 -> KV [color=gray, weight=0.1];
-  KV -> P2 [color=gray, weight=0.1];
-  KV -> T1 [color=gray, weight=0.1];
-  KV -> T2 [color=gray, weight=0.1];
-  KV -> T3 [color=gray, weight=0.1];
-}
-)
-'/>
--->
-
-<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>
-
-Each trainer runs the forward and backward passes using their local
-data:
-
-1. In the forward pass, when a trainer runs the forward algorithm of a
-   lookup operator, it retrieves W(x) from the storage service.
-1. The trainer computes W'(x) in the backward pass using W(x).
-
-During the global update process:
-
-1. Each trainer uploads its W'(x) to parameter servers.
-1. The parameter server runs the optimization algorithm, e.g., the
-   Adam optimization algorithm, which requires that
-   1. The parameter server retrieves W(x) from memcached, and
-   1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
-      W'(x))$ to memcached, where $f$ denotes the optimization
-      algorithm.
-
-### Storage Service Does Optimize
-
-This design is very similar to the above one, except that the
-optimization algorithm $f$ runs on the storage service.
-
-- Pro: parameter servers do not retrieve W(x) from the storage
-  service, thus saves half network communication.
-- Con: the storage service needs to be able to run the optimization
-  algorithm.
-
-## Distributed Sparse Table in Fluid
-
-For another design, we can implement a distributed sparse table in Fluid,
-and don't need to maintain an external storage component while training.
-
-You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md)
-and [Parameter Server](./parameter_server.md) before going on.
-
-![fluid lookup remote table](./src/fluid_lookup_remote_table.png)
-
-Partition a large table into multiple pserver instances
-1. `DistributeTranspiler` would split the table partitioned into some small
-table blocks with some partitioned algorithms such as
-[RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling),
-[Hash](https://en.wikipedia.org/wiki/Hash) and etc...
-1. For some cases, the range of input `Ids` is very wide and unpredictable, so the sparse
-table would be able to fill a new value for the id that didn't appear before with
-zero, uniform random or Gaussian distribution.
-
-For each Trainer's training process:
-1. In the forward pass, we use `pre-fetch` op to pre-fetch parameter blocks according to the
-input `Ids` from PServers instead of the local `lookup_table` op, and then merge the blocks
-into a parameter `W`.
-1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to PServer to
-execute the optimize pass.
-
-## Conclusion
-
-Let us do the "storage service does not optimize" solution first, as a
-baseline at least, because it is easier to use a well-optimized
-distributed storage service like memcached.  We can do the "storage
-service does optimize" solution later or at the same time, which, if
-implemented carefully, should have better performance than the former.
+## Distributed Lookup Table
+### Problem 1: The lookup table may be very large.
+
+ In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is:
+
+ ```
+ 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
+ ```
+
+### Solution: Distributed storage
+
+1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter.
+
+1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer.
+
+### Problem 2. The Id in the lookup table is not sure before training.
+
+ The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training.
+
+### Solution: Id auto growth
+
+At the beginning of training, paddle only malloc the memory for the lookup table at parameter server side, the Id and it's value will not be initialized. During training, when a parameter server received an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it.
+
+### Problem 3: parameter load and save
+
+For common parameters, paddle use trainer to save and load them. But for distributed lookup table, trainer cannot do this because it's large size.
+
+### Solution: Parameter server side save and load
+
+Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table.
+
+## Architecture
+The whole architecture of the distribute lookup table is as below:
+
+### Training steps:
+1. Read a batch of data, the data is feature ids.
+1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table.
+1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table.
+1. Run forward-backward to get the gradient of the lookup table.
+1. `split_ids_op` split the gradient and then use `send_op` to the parameter server.
+1. parameter server update the table with the received gradient.
+
+![distribute lookup table](./src/distributed_lookup_table.jpeg)
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ
diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md
new file mode 100644
index 0000000000000000000000000000000000000000..c29337cba1fe859e4968cb800e4e7d9ff6a54d31
--- /dev/null
+++ b/doc/fluid/design/ir/draft.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by
+user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the `gap` is bridged by
+
+* A serious transformations with defined order.
+
+* These transformations usually involve
+`insert, delete, clustering, split, dependency analysis`.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware secenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need a intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training, inference
+and other offline serielized model transformations.
+Learned from LLVM and other deep learning framework, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wraper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper method for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc);
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc);
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar();
+
+  // A more free style way of creating a graph node. Mostly use for test
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+    // Some correctness check.
+    auto new_graph = ApplyImpl(std::move(graph));
+    // Some correctness check.
+    return new_graph;
+  }
+
+  // Get a reference to the attributed previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) ;
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
+    // do something.
+    return graph;
+  }
+}
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass` with defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic to a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
+// For more complex Pass, Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass.SetNotOwned<int>("optimize_level", 1);
+mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_device_pass").Apply(std::move(grah));
+graph = PassRegistry::Instance().Get("multi_device_check_pass").Apply(std::move(grah));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 265732a348ea77d21005e335390d99abcdfbd045..83af4e55485c079265d3f2b1e15070825b532c02 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -98,13 +98,13 @@ class Block(objects):
     def append_operator(self, ...):
         self.ops.append(Operator(self, ...))
 
-    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+    def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
        self.ops.prepend(Operator(self, ...))
 ```
 
 `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
 
-`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
 
 ### Operator
 
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af
--- /dev/null
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
@@ -0,0 +1,110 @@
+Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point to represent weights and activations, which usually are in singe-precision float-point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements.  It is especially important for the inference in embedded-device deployment.
+
+According to some experiments, the apporach to quantize the model trained in float point directly works effectively on the large models, like the VGG model having many parameters. But the accuracy drops a lot for the small model. In order to improve the tradeoff between accuracy and latency, many quantized training apporaches are proposed.
+
+This document is to design a quantized training framework on Fluid. The first part will introduce how to quantize, The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize the float value to fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$  denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer. 
+
+
+The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
+
+$$ M  = max(abs(x))  $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer.  For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer. 
+
+
+Wether the *min-max* quantization or *max-abs* quantization, they also can be represent:
+
+$q = scale * r + b$
+
+We call *min-max*, *max-abs* as the quantization arguments, also call them quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass is simulated quantization, see Figure 1.
+
+The training framework is as following figure. 
+
+<p align="center"> 
+<img src="quantization_forward.png" width="300" height="340"><br/>
+Figure 1. Forward in training with simulated quantization.
+</p>
+
+- Firstly, both input and weight will be quantized to 8-bit integers. 
+- Second, do the multiplication (or convolution) operation with integers.
+- Third, dequantize the multiplication (or convolution) results to 32-bit float point.
+- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize for $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil  $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m) 
+\end{align}
+$$
+
+From these formulas, dequantization also can be moved before GEMM, do dequantization for $Xq$ and $Wq$ at first, then do GEMM. The forward workflow in training is equivalent to following framework.
+
+<p align="center"> 
+<img src="quantization_equivalent_forward.png"  width="300" height="330"><br/>
+Figure 2. Equivalent forward in training with simulated quantization.
+</p>
+
+We use this equivalent workflow in the training. In our desigin, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of quantization and de-quantization operator are still in floating point, they are called faked quantization and de-quantization operator. And the training framework is called simulated quantization.
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated by dequantized weights and activations. All inputs and outputs are float point with 32-bit. And in the weight updating process, the gradients will be added to the original weight, not the quantized or dequantized weights.
+
+<p align="center"> 
+<img src="quantization_backward_and_optimization.png"><br/>
+Figure 3. Backward and weight updating in training with simulated quantization.
+</p>
+
+So the quantization transipler will change some inputs of the corresponding backward operators. 
+
+### How to calculate quantization scale
+
+There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs.
+
+For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished.
+
+For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
+
+
+1. Calculate the mean of maximum absolute during a window.
+2. Calculate the max of maximum absolute during a window.
+3. Calculate the running mean of maximum absolute during a window, as follows:
+
+    $$ Vt = (1 - k) * V +  k * V_{t-1}  $$
+    
+    where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9.
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
new file mode 100644
index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
index 84005b54e07cf810649370d2c1f6b6c522434bf6..91357dd8c8da19f2f33c6f285ed7eb234428b1ab 100644
--- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -7,13 +7,13 @@
 ======================   ========================================
 版本说明                            C++预测库   
 ======================   ========================================
-cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
-cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
-cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
-cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
-cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
-cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
-cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
 ======================   ========================================
 
 从源码编译
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
 # 如何使用timeline工具做性能分析
 
-1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
 
 	**提示：**
 	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[])
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 	            ...
 	```
 
 1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
 
 1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
 
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[],
-	                    use_program_cache=True)
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 	            ...
 	```
 
@@ -17,6 +19,10 @@
 file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
 
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 	![chrome tracing](./tracing.jpeg)
diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md
index 58aa73b8cd38d01e2426278a3479714e4fb6a3b0..749cf7693c75696feb17f8556224ed03649baa80 100644
--- a/doc/fluid/howto/performance/error_clip.md
+++ b/doc/fluid/howto/performance/error_clip.md
@@ -78,7 +78,7 @@ def error_clip_callback(block, context):
     op_desc = block.desc.op(block.desc.op_size() - 1)
     for grad_n in filter(lambda n: grad_to_var.has_key(n),
                          op_desc.output_arg_names()):
-        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        fwd_var = block.__var_recursive(grad_to_var[grad_n])
         error_clip = getattr(fwd_var, "error_clip", None)
         if not (error_clip is None or isinstance(error_clip,
                                                  BaseErrorClipAttr)):
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
index d878d192cae7ee9e8b8fdb4f615839c186fdf334..6b1ef3ceed4f7ed5073d42c13ce103e2ab467e58 100644
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
@@ -1,12 +1,16 @@
- PaddlePaddle Fluid
-==========================
+.. PaddlePaddle Fluid documentation master file, created by
+   sphinx-quickstart on Thu Jun  7 17:04:53 2018.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+##############
+欢迎使用 Fluid
+##############
 
 ..  toctree::
-  :maxdepth: 1
+    :maxdepth: 1
 
-  getstarted/index_cn.rst
-  build_and_install/index_cn.rst
-  design/index_cn.rst
-  howto/index_cn.rst
-  dev/index_cn.rst
-  faq/index_cn.rst
+    new_docs/beginners_guide/index.rst
+    new_docs/user_guides/index.rst
+    new_docs/advanced_usage/index.rst
+    new_docs/faq/index_cn.rst
diff --git a/doc/fluid/new_docs/advanced_usage/benchmark.rst b/doc/fluid/new_docs/advanced_usage/benchmark.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7854263bf8f64c840492550fb22152582c7d2361
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/benchmark.rst
@@ -0,0 +1,120 @@
+#################
+如何进行基准测试
+#################
+
+本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面，下文包含搭建测试环境，选择基准测试模型，验证测试结果等几方面内容。
+
+验证深度学习框架，可分为训练和测试两个阶段， 验证指标略有不同，本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度，训练集是完备的，因此关注大batch\_size下的训练速度,关注吞吐量，例如图像模型常用的batch\_size=128, 多卡情况下会加大；预测阶段关注的是在测试集上的精度，线上服务测试数据不能提前收集，因此关注小batch\_size下的预测速度，关注延迟，例如预测服务常用的batch\_size=1, 4等。
+
+`Fluid <https://github.com/PaddlePaddle/Paddle>`__ 是PaddlePaddle从0.11.0版本开始引入的设计，本文的基准测试在该版本上完成。
+
+
+环境搭建
+""""""""""""
+
+基准测试中模型精度和硬件、框架无关，由模型结构和数据共同决定；性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异，控制硬件环境，系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行.
+
+
+不同架构的GPU卡性能差异巨大，在验证模型在GPU上训练性能时，可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号，如果测试多卡训练性能，需确认硬件连接是 `nvlink <https://zh.wikipedia.org/zh/NVLink>`__ 或 `PCIe <https://zh.wikipedia.org/zh-hans/PCI_Express>`__ 。 同样地，CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数，确认当前正在使用的CPU型号。
+
+下载GPU对应的Cuda Tool Kit和 Cudnn，或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`__, 镜像内包含了Cuda和Cudnn，本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库，影响在此基础上编译出的Fluid二进制运行性能。
+
+准备好Cuda环境后，从github上的下载Paddle并源码编译，会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`__\ 。另外，cudnn对卷积类任务影响巨大，在基准测试中需要小版本一致，例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。
+
+
+选择基准模型
+""""""""""""
+
+对框架做基准测试，需要覆盖不同训练任务和不同大小的模型，本文中选取了图像和NLP的最为常用的5个模型。
+
+============  ============  =================  ============
+任务种类        模型名称       网络结构         数据集     
+============  ============  =================  ============
+图像分类      mnist         Lenet              mnist
+图像分类      VGG           VGG-16             Flowers102
+图像分类      Resnet        Resnet-50          Flowers102
+文本分类      Stacked-LSTM  Stacked-LSTM       IMDB 
+机器翻译      seq-seq       Stacked-LSTM       wmt14 
+============  ============  =================  ============
+
+其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。
+`benchmark <https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid>`__
+基准模型测试脚本中，均跳过了前几个batch的训练过程，原因是加载数据和分配显存受系统当前运行情况影响，会导致统计性能不准确。运行完若干个轮次后，统计对应指标。
+
+
+基准模型的数据的选择方面，数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 <http://www.robots.ox.ac.uk/~vgg/data/flowers/102/>`__ ，图像大小预处理为和Imagenet相同大小，因此性能可直接对比
+NLP模型的公开且影响力大数据集较少，seq2seq模型选择了wmt14数据，stacked-lstm模型中选择了 `imdb <https://www.imdb.com/interfaces/>`__ 数据。
+
+
+注意，图像模型每条样本大小相同，图像经过变换后大小一致，因此经过的计算路径基本相同，计算速度和显存占用波动较小，可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定，计算路径和显存占用也不相同，因此只能完整运行若干个轮次后，统计速度和显存消耗。
+显存分配是特别耗时的操作，因此Fluid默认会占用所有可用显存空间形成显存池，用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗，可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`，观察最大显存开销。
+
+
+测试过程
+""""""""""""
+
+-  CPU 单机单线程测试
+
+测试CPU上单线程的性能，先设置CUDA的环境变量为空，``CUDA_VISIBLE_DEVICES=``，并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1``， ``MKL_NUM_THREADS=1;``。
+然后代码中设置为使用CPUPlace，如果使用Paddle代码库中的脚本，只需要命令行参数传入 use_gpu=False即可。
+
+.. code-block:: python
+
+    >>> import paddle.fluid as fluid
+    >>> place = fluid.CPUPlace() 
+
+.. code:: bash
+
+    docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash
+
+
+-  GPU 单机单卡测试
+
+本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
+
+.. code:: bash
+
+    nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash
+在单卡上测试，设置CUDA的环境变量使用一块GPU，``CUDA_VISIBLE_DEVICES=0``
+然后代码中设置为使用CUDAPlace，如果使用Paddle代码库中的脚本，只需要命令行参数传入 use_gpu=True即可。
+
+.. code-block:: python
+
+    >>> import paddle.fluid as fluid
+    >>> place = fluid.CUDAPlace(0) // 0 指第0块GPU
+
+
+测试结果
+""""""""""""
+
+本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。
+硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。
+系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境，系统版本为nvidia-docker17.05.0-ce。
+测试的Fluid版本为\ `v.0.12.0 <https://github.com/PaddlePaddle/Paddle/releases/tag/v.0.12.0>`__ 。
+TensorFlow版本为\ `v.1.4.0-rc1 <https://github.com/tensorflow/tensorflow/tree/v1.4.0-rc1>`__ 。
+使用的脚本和配置见\ `benchmark <https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/fluid>`__ 。
+图表中统计单位为samples/秒。
+
+- CPU 单机单线程测试结果
+
+  ================  ====================  ===================
+   Speed            Fluid CPU              TensorFlow CPU    
+  ================  ====================  ===================
+  mnist             1298.75 samples/s     637.57 samples/s  
+  VGG-16            0.4147 images/s       0.1229 images/s   
+  Resnet-50         1.6935 images/s       0.3657 images/s   
+  Stacked-LSTM      472.3225 words/s      48.2293words/s    
+  Seq2Seq           217.1655 words/s      28.6164 words/s   
+  ================  ====================  ===================
+
+- GPU 单机单卡测试结果
+
+  =============== =====================  =================
+   Speed           Fluid GPU              TensorFlow GPU      
+  =============== =====================  =================
+   mnist           19710.90 samples/s    15576.3 samples/s        
+   VGG-16          59.83327 images/s     40.9967 images/s    
+   Resnet-50       105.84412             97.8923 images/s    
+   Stacked-LSTM    1319.99315            1608.2526 words/s   
+   Seq2Seq         7147.89081            6845.1161 words/s   
+  =============== =====================  =================
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..08ea379f81d16407ed5f82770b55a34bcf138da8
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
@@ -0,0 +1,56 @@
+# Anakin ARM 性能测试
+
+## 测试环境和参数:
++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd
++ 采用android ndk交叉编译，gcc 4.9，enable neon， ABI： armveabi-v7a with neon -mfloat-abi=softfp
++ 测试平台
+   - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz
+   - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz
+   - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz
++ 多线程：openmp
++ 时间：warmup10次，运行10次取均值
++ ncnn版本：来源于github的master branch中commits ID：307a77f04be29875f40d337cfff6df747df09de6（msg:convert            LogisticRegressionOutput)版本
++ TFlite版本：来源于github的master branch中commits ID：65c05bc2ac19f51f7027e66350bc71652662125c（msg:Removed unneeded file copy that was causing failure in Pi builds)版本
+
+在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析
+
+## BenchMark model
+
+> 注意在性能测试之前，请先将测试model通过[External Converter](#10003)转换为Anakin model
+> 对这些model，本文在ARM上进行多线程的单batch size测试。
+
+- [Mobilenet v1](#11)  *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [Mobilenet v2](#22)  *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [mobilenet-ssd](#33)  *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载*
+
+### <span id = '11'> mobilenetv1 </span>
+
+   |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| 
+   |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+   |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
+   |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
+   |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan| 
+
+### <span id = '22'> mobilenetv2 </span>
+
+   |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| 
+   |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+   |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
+   |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
+   |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
+
+### <span id = '33'> mobilenet-ssd </span>
+
+   |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| 
+   |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+   |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
+   |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
+   |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan
+
+## How to run those Benchmark models?
+
+1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换
+2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机
+3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令
+4. 最后，终端显示器上将会打印该模型的运行时间
+5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
@@ -0,0 +1,28 @@
+# Example
+Anakin目前只支持NCHW的格式
+示例文件在test/framework/net下
+
+## 在NV的GPU上运行CNN模型
+示例文件为打开example_nv_cnn_net.cpp，整体流程如下：
+- 将模型的的path设置为anakin模型的路径，初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
+- 根据模型设置网络图的输入尺寸，进行图优化
+- 根据优化后的网络图初始化网络执行器
+- 取出网络的输入tensor，将数据拷贝到输入tensor
+- 运行推导
+- 取出网络的输出tensor
+
+以NV平台为例演示Anakin框架的使用方法，注意编译时需要打开GPU编译开关
+
+## 在X86上运行RNN模型
+示例文件为example_x86_rnn_net.cpp
+整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
+- 使用X86标识初始化图对象和网络执行器对象
+- rnn模型的输入尺寸是可变的，初始化图时的输入维度是维度的最大值，输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词，其中第0到第4个词是第一句话，第5到第11个词是第二句话
+
+以X86平台为例演示Anakin框架的使用方法，注意编译时需要打开X86编译开关
+
+## 在NV的GPU上使用Anakin的线程池运行CNN模型
+示例文件为example_nv_cnn_net_multi_thread.cpp ，示例使用worker的同步预测接口
+整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
+- 用模型地址和线程池大小初始化worker对象
+- 将输入tensor注入任务队列,获得输出tensor
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..667f9396f1169a0d891b9e6b0e912aa5527ab0b8
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
@@ -0,0 +1,170 @@
+# Anakin GPU Benchmark
+
+## Machine:
+
+>  CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
+>  GPU: `Tesla P4`
+>  cuDNN: `v7`
+
+
+## Counterpart of anakin  :
+
+The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** ,   The models which TensorRT 3 doesn't support we use the custom plugins  to support.
+
+## Benchmark Model
+
+The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`.
+ You can use pretrained caffe model or the model trained by youself.
+
+> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md)
+
+
+- [Vgg16](#1)   *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
+- [Yolo](#2)  *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
+- [Resnet50](#3)  *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Resnet101](#4)  *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Mobilenet v1](#5)  *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#6)  *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [RNN](#7)  *not support yet*
+
+We tested them on single-GPU with single-thread.
+
+### <span id = '1'>VGG16 </span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 8.8690 | 8.2815 |
+| 2 | 15.5344 | 13.9116 |
+| 4 | 26.6000 | 21.8747 |
+| 8 | 49.8279 | 40.4076 |
+| 32 | 188.6270 | 163.7660 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 963 | 997 |
+| 2 | 965 | 1039 |
+| 4 | 991 | 1115 |
+| 8 | 1067 | 1269 |
+| 32 | 1715 | 2193 |
+
+
+### <span id = '2'>Yolo </span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 16.4596| 15.2124 |
+| 2 | 26.6347| 25.0442 |
+| 4 | 43.3695| 43.5017 |
+| 8 | 80.9139 | 80.9880 |
+| 32 | 293.8080| 310.8810 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 1569 | 1775 |
+| 2 | 1649 | 1815 |
+| 4 | 1709 | 1887 |
+| 8 | 1731 | 2031 |
+| 32 | 2253 | 2907 |
+
+### <span id = '3'> Resnet50 </span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 4.2459   |  4.1061 |
+| 2 |  6.2627  |  6.5159 |
+| 4 | 10.1277  | 11.3327 |
+| 8 | 17.8209  | 20.6680 |
+| 32 | 65.8582 | 77.8858 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 531  | 503 |
+| 2 | 543  | 517 |
+| 4 | 583 | 541 |
+| 8 | 611 | 589 |
+| 32 |  809 | 879 |
+
+### <span id = '4'> Resnet101 </span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 7.5562 | 7.0837 |
+| 2 | 11.6023 | 11.4079 |
+| 4 | 18.3650 | 20.0493 |
+| 8 | 32.7632 | 36.0648 |
+| 32 | 123.2550 | 135.4880 |
+
+- GPU Memory Used (`MB)`
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 701  | 683 |
+| 2 | 713  | 697 |
+| 4 | 793 | 721 |
+| 8 | 819 | 769 |
+| 32 | 1043 | 1059 |
+
+###  <span id = '5'> MobileNet V1 </span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 45.5156  |  1.3947 |
+| 2 |  46.5585  |  2.5483 |
+| 4 | 48.4242  | 4.3404 |
+| 8 |  52.7957 |  8.1513 |
+| 32 | 83.2519 | 31.3178 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 329  | 283 |
+| 2 | 345  | 289 |
+| 4 | 371 | 299 |
+| 8 | 393 | 319 |
+| 32 |  531 | 433 |
+
+###  <span id = '6'> MobileNet V2</span>
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 65.6861 | 2.9842 |
+| 2 | 66.6814 | 4.7472 |
+| 4 | 69.7114 | 7.4163 |
+| 8 | 76.1092 | 12.8779 |
+| 32 | 124.9810 | 47.2142 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 341 | 293 |
+| 2 | 353 | 301 |
+| 4 | 385 | 319 |
+| 8 | 421 | 351 |
+| 32 | 637 | 551 |
+
+## How to run those Benchmark models?
+
+> 1. At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md).
+> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file.
+> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen.
+> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in  model log file.
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5efbc89abd469871b318c306e8cb03dd95f0c85b
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
@@ -0,0 +1,639 @@
+# Anakin 使用教程 ##
+
+本教程将会简略的介绍Anakin的工作原理，一些基本的Anakin API，以及如何调用这些API。
+  
+## 内容 ###
+
+- [Anakin的工作原理](#principle)
+- [Anakin APIs](#api)
+- [示例代码](#example)
+
+## <span id = 'principle'> Anakin的工作原理</span> ###
+
+![Anakin_principle](../pics/anakin_fm_ch.png)
+
+用Anakin来进行前向计算主要分为三个步骤：
+
+- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型  
+  在使用Anakin之前，用户必须将所有其他模型转换成Anakin模型，我们提供了转换脚本，用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。
+- 生成Anakin计算图
+  加载Anakin模型生成原始计算图，然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。
+- 执行计算图  
+  Anakin会选择不同硬件平台执行计算图。
+
+
+## <span id ='api'>Anakin APIs </span> ###
+### Tensor ####
+
+`Tensor`提供基础的数据操作和管理，为ops提供统一的数据接口。`Tensor`包含以下几个属性：   
+
+- Buffer  
+   数据存储区
+- Shape  
+   数据的维度信息
+- Event  
+   用于异步计算的同步
+
+ `Tensor` 类包含三个`Shape`对象， 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息，`_valid_shape`表示当前`tensor`使用的空间信息， `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示。
+
+
+Dimentions | Math entity |
+ :----: | :----:
+1 | vector
+2 | matrix
+3 | 3-tensor
+n | n-tensor
+
+#### 声明tensor对象
+
+`Tensor`接受三个模板参数:
+
+
+```c++
+ template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
+ class Tensor .../* Inherit other class */{
+  //some implements
+  ...
+ };
+```
+
+TargetType是平台类型，如X86，GPU等等，在Anakin内部有相应的标识与之对应；datatype是普通的数据类型，在Anakin内部也有相应的标志与之对应；[LayOutType](#layout)是数据分布类型，如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
+
+1. <span id='target'>TargetType</sapn>
+
+ Anakin TargetType | platform
+  :----: | :----:|
+  NV | NVIDIA GPU
+  ARM | ARM
+  AMD | AMD GPU
+  X86 | X86
+  NVHX86 | NVIDIA GPU with Pinned Memory
+
+2. <sapn id='datatype'>DataType</span>
+
+Anakin DataType | C++ | Description 
+:---: | :---: | :---: |
+AK_HALF | short | fp16
+AK_FLOAT | float | fp32
+AK_DOUBLE | double | fp64
+AK_INT8 | char | int8
+AK_INT16 | short | int16
+AK_INT32 | int | int32
+AK_INT64 | long | int64
+AK_UINT8 | unsigned char | uint8
+AK_UINT16 | unsigned short | uint8
+AK_UINT32 | unsigned int | uint32
+AK_STRING | std::string | /
+AK_BOOL | bool | /
+AK_SHAPE | / | Anakin Shape 
+AK_TENSOR | / | Anakin Tensor 
+
+
+3. <span id = 'layout'>LayOutType </span>
+
+Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support
+:---: | :---: | :---: | :---: |
+W | 1-D | YES | NO
+HW | 2-D | YES | NO
+WH | 2-D | YES | NO
+NW | 2-D | YES | YES
+NHW | 3-D | YES |YES
+NCHW ( default ) | 4-D | YES | YES
+NHWC | 4-D | YES | NO
+NCHW_C4 | 5-D | YES | YES
+
+
+理论上，Anakin支持申明1维以上的tensor，但是对于Anakin中的Op来说，只支持NW、NHW、NCHW、NCHW_C4这四种LayOut，其中NCHW是默认的LayOutType，NCHW_C4是专门针对于int8这种数据类型的。
+
+
+例子
+
+> 下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
+
+> 要想获得更多关于tensor的信息， 请参考 *soure_path/core/tensor.h*
+
+> 1. 使用shape对象初始化tensor
+``` c++  
+  //create a null tensor. A null tensor holds for nothing.
+  //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
+  //tensor's Layout is NCHW(default)
+   Tensor<X86, AK_FLOAT> mytensor;
+
+   //1. using shape object to create a tensor.
+   Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
+   Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+
+  // A 4-D shape
+   Shape shape2(N, C, H, W); // batch x channel x height x width
+```
+
+>`注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`  
+
+
+```c++
+   // A 4-D tensor.
+   Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+
+   //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+   Tensor<NV, AK_INT8> mytensor3(shape2);   //right
+   
+   Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
+   Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
+
+```
+
+> 2. 使用现有的数据和shape初始化tensor
+
+```c++
+
+   /**
+   *  A construtor of Tensor.
+   *  data_ptr is a pointer to any data type of data
+   *  TargetType is type of a platform [Anakin TargetType]
+   *  id : device id
+   *  shape: a Anakin shape
+   */
+   Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+   //using existing data feed to a tensor
+   Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+
+```
+
+> 3. 使用tensor初始化tensor
+
+```c++
+   Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+```
+
+
+> 提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
+
+
+#### 填充tensor数据区
+
+
+填充数据区得看你申明tensor的方式， 下面展示了如何填充tensor的数据区。
+
+```c++
+首先来看看tensor的四种声明方式：
+
+1. Tensor<X86, AK_FLOAT> mytensor;
+2. Tensor<X86, AK_FLOAT, W> mytensor1(shape1);
+3. Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape);
+4. Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+
+
+相关的声明方式的数据填充方法如下：
+
+1：声明一个空的tensor，此时没有为其分配内存，所以，我们需要手动的为其分配内存。
+            
+            //parama shape
+            mytensor.re_alloc(Shape shape); 
+
+            //Get writable pointer to mytensor.
+            //parama index (int): where you start to write.
+            //Dtype is your data type such int, float or double.
+            Dtype *p = mytensor.mutable_data(index/*=0*/);
+            //write data to mytensor
+            for(int i = 0; i < mytensor.size(); i++){
+              p[i] = 1.0f;
+            }
+            //do something ...
+
+2: 这种声明方式会自动分配内存 
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor1.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+ 
+3：在该种声明方式中，我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存，得依情况而定。如果data_ptr和申明的
+tensor都在都一个目标平台上，那么该tensor就会与data_ptr共享内存空间，相反，如果他们不在同一个平台上（如data_ptr在X86上，而
+tensor在GPU上），那么此时tensor就会开辟一个新的内存空间，并将data_ptr所指向的数据拷贝到tensor的buffer中。
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+4：该种方式仍不需要手动分配内存
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+
+另外，你还可以获取一个tensor的可读指针，示例如下：
+        //Get read-only pointer to mytensor.
+        //parama index (int): where you start to read.
+        //Dtype is your data type such int, float or double.
+         Dtype *p = mytensor.data(index/*=0*/);
+        //do something ...
+```
+
+如果想更详细的了解tensor，请查阅*soure_path/saber/core/tensor.h*
+
+#### 获取tensor的shape
+
+```c++
+//some declarations
+// ...
+Shape shape = mytensor.shape();
+
+//Get a first dimetion size of tesor, if it has.
+int d1 = shape[0];
+
+//Get a second dimention size of tensor, if it has.
+int d2 = shape[1];
+
+...
+
+//Get a n-th dimention size of tensor, if it has.
+int dn = shape[n-1];
+
+
+//Get a tensor's dimention
+int dims = mytensor.dims();
+
+//Get the size of tensor.
+//size = d1 x d2 x ... x dn.
+int size = mytensor.size();
+
+//Get the size of tensor at interval [Di, Dj)
+// form i-th dimention to j-th dimention, but not including the j-th dimention.
+// which means di x (di+1) x ... x (dj -1)
+int size = mytensor.count(start, end);
+```
+
+#### 设置tensor的shape
+
+我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
+
+
+```c++
+/**
+ * \brief set a tensor's shape
+ * \param valid_shape [a Shape object]
+ * \param shape [a Shape object]
+ * \param offset [a Shape object]
+ * \return the status of this operation, that means whether it success * or not.
+ */
+SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); 
+```
+
+这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同，如果不同就会出错，返回SaberInvalidValue。 如果相同，那么将成功设置tensor的shape。
+
+```c++
+
+// some declarations
+// ...
+//valid_shape, shape , offset are Shape object;
+//All these Shape object's LayOutType must be equal to mytensor's.
+mytensor.set_shape(valid_shape, shape, offset);
+
+```
+
+#### 重置 tensor的shape
+
+```c++
+//some declarations
+Shape shape, valid_shape, offset;
+
+//do some initializations
+... 
+mytensor.reshape(valid_shape, shape, offset);
+```
+
+注意： Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
+
+
+### Graph ###
+
+`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
+
+#### 图的声明
+
+与`Tensor`一样，graph也接受三个模板参数。
+
+```c++
+
+template<typename TargetType, DataType Dtype, Precision Ptype>
+class Graph ... /* inherit other class*/{
+  
+  //some implements
+  ...
+
+};
+```
+
+前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。
+
+
+```c++
+
+//Create a empty graph object.
+Graph graph = Graph<NV, AK_FLOAT, Precision::FP32> tmp();
+
+//Create a pointer to a empty graph.
+Graph *graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+//Create a pointer to a empty graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+```
+
+#### 加载 Anakin 模型
+
+```c++
+//some declarations
+...
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+std::string model_path = "the/path/to/where/your/models/are";
+const char *model_path1 = "the/path/to/where/your/models/are";
+
+//Loading Anakin model to generate a compute graph.
+auto status = graph->load(model_path);
+
+//Or this way.
+auto status = graph->load(model_path1);
+//Check whether load operation success.
+if(!status){
+  std::cout << "error" << endl;
+  //do something...
+}
+
+```
+
+#### 优化计算图
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//According to the ops of loaded graph, optimize compute graph.
+graph->Optimize();
+
+```
+
+> 注意： 第一次加载原始图，必须要优化。
+
+#### 保存模型
+
+你可以在任何时候保存模型， 特别的， 你可以保存一个优化的模型，这样，下次再加载模型时，就不必进行优化操作。
+
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+// save a model
+//save_model_path: the path to where your model is.
+auto status = graph->save(save_model_path);
+
+//Checking
+if(!status){
+  cout << "error" << endl;
+  //do somethin...
+}
+```
+
+#### 重新设置计算图里的tensor的shape
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+vector<int> shape{10, 256, 256, 10};
+//input_name : std::string.
+//Reshape a tensor named input_name.
+graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
+```
+
+#### 设置 batch size
+
+`Graph` 支持重新设置batch size的大小。
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//input_name : std::string.
+//Reset a tensor named input_name.
+int new_batch_size = 4;
+graph->ResetBatchSize(input_name, new_batch_size);
+```
+
+###  Net ###
+
+
+`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出
+#### Creating a graph executor
+
+`Net`接受四个模板参数。  
+
+
+```c++
+template<typename TargetType, DataType Dtype, Precision PType OpRunType RunType = OpRunType::ASYNC>
+class Net{
+  //some implements
+  ...
+
+};
+```
+由于有些Op可能支持多种精度，我们可以通过Precision来指定。OpRunType表示同步或异步类型，异步是默认类型。OpRunType::SYNC表示同步，在GPU上只有单个流；OpRunType::ASYNC表示异步，在GPU上有多个流并以异步方式执行。实际上，Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
+
+
+1. <span id = 'precision'> Precision </span>
+
+Precision | Op support
+:---: | :---:
+Precision::INT4 | NO
+Precision::INT8 | NO
+Precision::FP16 | NO
+Precision::FP32 | YES
+Precision::FP64 | NO
+
+现在Op的精度只支持FP32， 但在将来我们会支持剩下的Precision.
+
+
+
+2. OpRunType
+
+OpRunType | Sync/Aync |Description
+:---: | :---: | :---:
+OpRunType::SYNC | Synchronization | single-stream on GPU
+OpRunType::ASYNC | Asynchronization | multi-stream on GPU
+
+用graph对象创建一个执行器。
+```c++
+//some declarations
+...
+//Create a pointer to a graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+//do something...
+...
+
+//create a executor
+Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
+
+```
+
+#### 获取输入输出tensor
+
+
+获取输入输出tensor，并填充输入tensor的buffer。如果想要获取输入和输出tensor，那么必须指定输入的名字，如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外，如果想知道input_i对应哪个输入，你需要去dash board查看，如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
+
+```c++
+//some declaratinos
+...
+
+//create a executor
+//TargetType is NV [NVIDIA GPU]
+Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
+
+//Get the first input tensor.
+//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
+//Note: Member function get_in returns an pointer to tensor.
+Tensor<NV, AK_FLOAT>* tensor_in0 = executor.get_in("input_0");
+
+//If you have multiple input tensors
+//You just type this code below.
+Tensor<NV, AK_FLOAT>* tensor_in1 = executor.get_in("input_1");
+...
+auto tensor_inn = executor.get_in("input_n");
+```
+
+当得到输入tensor之后，就可以填充它的数据区了。
+
+```c++
+//This tensor is resident at GPU.
+auto tensor_d_in = executor.get_in("input_0");
+
+//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
+
+//using Tensor4d = Tensor<Ttype, Dtype>;
+Tensor4d<X86, AK_FLOAT> tensor_h_in; //host tensor;
+//Tensor<X86, AK_FLOAT> tensor_h_in; 
+
+//Allocate memory for host tensor.
+tensor_h_in.re_alloc(tensor_d_in->valid_shape());
+//Get a writable pointer to tensor.
+float *h_data = tensor_h_in.mutable_data();
+
+//Feed your tensor.
+/** example
+for(int i = 0; i < tensor_h_in.size(); i++){
+  h_data[i] = 1.0f;
+}
+*/
+//Copy host tensor's data to device tensor.
+tensor_d_in->copy_from(tensor_h_in);
+
+// And then
+```
+
+
+类似的，我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是， 我们需要指定输入tensor结点的名字，这个可以从dash board中看到，请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor：
+```c++
+//Note: this tensor are resident at GPU.
+Tensor<NV, AK_FLOAT>* tensor_out_d = executor.get_out("pred_out");
+
+```
+
+
+#### Executing graph
+
+
+当一切准备就绪后，我们就可以执行真正的计算了！
+```c++
+executor.prediction();
+```
+ 
+## <span id='example'> 示例代码 </span> ##
+
+下面的例子展示了如何调用Anakin。
+
+在这儿之前， 请确保你已经有了Anakin模型。如果还没有，那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
+
+### Single-thread
+
+单线程例子在 *source_root/test/framework/net/net_exec_test.cpp`*
+
+```c++
+
+std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
+// Create an empty graph object.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+// Load Anakin model.
+auto status = graph->load(model_path);
+if(!status ) {
+    LOG(FATAL) << " [ERROR] " << status.info();
+}
+// Reshape
+graph->Reshape("input_0", {10, 384, 960, 10});
+// You must optimize graph for the first time.
+graph->Optimize();
+// Create a executer.
+Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
+
+//Get your input tensors through some specific string such as "input_0", "input_1", and 
+//so on. 
+//And then, feed the input tensor.
+//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out.
+auto d_tensor_in_p = net_executer.get_in("input_0");
+Tensor4d<X86, AK_FLOAT> h_tensor_in;
+auto valid_shape_in = d_tensor_in_p->valid_shape();
+for (int i=0; i<valid_shape_in.size(); i++) {
+    LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; //see tensor's dimentions
+}
+h_tensor_in.re_alloc(valid_shape_in);
+float* h_data = h_tensor_in.mutable_data();
+for (int i=0; i<h_tensor_in.size(); i++) {
+    h_data[i] = 1.0f;
+}
+d_tensor_in_p->copy_from(h_tensor_in);
+
+//Do inference.
+net_executer.prediction();
+
+//Get result tensor through the name of output node.
+//And also, you need to see the dash board again to find out how many output nodes are and remember their name.
+
+//For example, you've got a output node named obj_pre_out
+//Then, you can get an output tensor.
+auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
+auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
+//......
+// do something else ...
+//...
+//save model.
+//You might not optimize the graph when you load the saved model again.
+std::string save_model_path = model_path + std::string(".saved");
+auto status = graph->save(save_model_path);
+if (!status ) {
+    LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3884284ea020fe94ed9c03ec84c856ee44aa8c3f
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
@@ -0,0 +1,99 @@
+.. _install_or_build_cpp_inference_lib:
+
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
+======================   ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配制下面这些编译选项：
+
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配制编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+      make
+      make inference_lib_dist
+
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+
+  .. code-block:: text
+
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+     
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+
+  .. code-block:: text
+
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..56ca582b2b47f404ede777712830731ea7f4e9b5
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
@@ -0,0 +1,73 @@
+# 模型转换指南
+
+Anakin 支持不同框架的模型预测。但由于格式的差别，Anakin 需要您预先转换模型。本文档介绍如何转换模型。
+
+## 简介
+
+Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型，模型包含网络结构（model 或 prototxt）和权重参数（param 或 caffemodel）。   
+
+模型转换的输出是一个 bin 文件，它作为 Anakin 框架的 graph 参数导入。   
+
+您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。   
+
+
+## 系统要求
+
+- python 2.7+
+- pyyaml
+- flask
+- protobuf 3.5+
+
+
+## 用法
+
+### 1、环境
+转换器所需的依赖标注于 *系统要求* 一节。
+
+### 2、配置
+您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例，下面作进一步说明。
+
+#### config.yaml
+```bash
+OPTIONS:
+    Framework: CAFFE       # 依框架类型填写 CAFFE 或 FLUID
+    SavePath: ./output     # 转换结束后模型的保存位置
+    ResultName: googlenet  # 输出模型的名字
+    Config:
+        LaunchBoard: ON    # 是否生成网络结构预览页面
+        Server:
+            ip: 0.0.0.0
+            port: 8888     # 从一个可用端口访问预览页面
+        OptimizedGraph:    # 当您使用了 Anakin 框架的 Optimized 功能时，才应该打开此项
+            enable: OFF
+            path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
+    LOGGER:
+        LogToPath: ./log/  # 生成日志的路径
+        WithColor: ON
+
+TARGET:
+    CAFFE:
+        # 当 Framework 为 CAFFE 时需填写
+        ProtoPaths:
+            - /path/to/caffe/src/caffe/proto/caffe.proto
+        PrototxtPath: /path/to/your/googlenet.prototxt
+        ModelPath: /path/to/your/googlenet.caffemodel
+
+    FLUID:
+        # 当 Framework 为 FLUID 时需填写
+        Debug: NULL
+        ProtoPaths:
+            - /
+        PrototxtPath: /path/to/fluid/inference_model
+        ModelPath: /path/to/fluid/inference_model
+	# ...
+```
+
+### 3、转换
+在完成配置文件的修改后，您只需执行 ```python converter.py``` 就可以进行模型转换了。
+
+
+### 4、预览
+最后一步，就是在浏览器中查看令人振奋的转换结果！网址是在 *config.yaml* 中配置的，例如 http://0.0.0.0:8888 。
+
+> 注意：若您使用了默认的 IP 地址 0.0.0.0，请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
@@ -0,0 +1,405 @@
+# 如何增加新的Operator
+
+## 基本概念
+
+简单介绍下几个同Operator相关的基本概念，详情请参考设计文档。
+
+```framework```: 上层的逻辑代码，负责从parser中获取参数及weights，添加op时主要修改framework/operator目录下的内容。
+
+```saber```: 底层的实现代码，Anakin通过saber封装了不同的backends，不同的实现(impl)分别特化出自己的实现，外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中，增加op主要修改saber/funcs下的内容。
+
+saber的文件结构：
+* saber/funcs下的是各个funcs的外部接口，这一层的op与具体的设备实现无关，只与各op完成的功能有关。由于跟实现(impl)无关，本层文件明均不带impl。
+* saber/funcs/impl下是各个op的impl声明，特定设备需要完成该层声明的特化版本，如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本，saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关，均带有```impl_```前缀。
+* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件，添加cuda的kernel需要在该文件目录下添加。
+* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
+
+### 涉及到的基类及各个类之前的关系
+
+简单介绍相关的基类
+
+* ```anakin::Operator```: framework的operator基类，位于framework/core/operator/operator.h
+
+* ```anakin::saber::BaseFunc```: saber对外的op接口基类，提供统一的对外接口，位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape，并通过```tensor```的```set_shape```接口(只设置shape，不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
+
+* ```ankain::saber::ImplBase```: saber设备实现的op的接口，所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类，一类以```vender_```为前缀，带有```vender_```代码意为使用第三方库来实现该op，如cudnn的conv，或mkl的conv等等，这类op的性能我们难以调优，因此单独列为一类。另一类是带有源码的saber实现，这些实现都带有```saber_```为前缀，此类实现带有源码，能够通过后续优化不断提升性能，实现起名时需要注意这一点。
+
+## 添加operator
+
+添加一个新的op需要以下几步：
+
+1. 添加saber的param
+2. 定义saber的Operator类
+3. 定义新的impl声明
+3. 完成新的impl实现
+4. 增加framework的实现或特化
+
+接下来就针对这几步，以一个简单例子为例介绍实现。
+
+例如我们要添加新的Mul op。给出计算公式如下：$$Out = alpha \dot X * Y$$
+
+### 为operator增加param
+
+涉及到的文件：```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param，这一步可以跳过。
+这里```XXXParam```是一个```struct```。包含一个无参数的构造函数，含参数的构造函数，复制构造函数，```operator=()```及```operator==()```。
+```
+template <typename opTensor> // 能够获得target, datatype, layout
+struct MulParam{
+  MulParam()
+    : alpha(0)
+  {}
+  MulParam(float alpha_in)
+    : alpha(alpha_in)
+  {}
+  MulParam(const MulParam& right)
+    : alpha(right.alpha)
+  {}
+  MulParam &operator=(const MulParam &right) {
+    alpha = right.alpha;
+  }
+  bool operator==(const MulParam &right) {
+    return alpha == right.alpha;
+  }
+  float alpha;
+};
+```
+
+### 定义Operator类
+涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类，这里需要修改输入的impl定义头文件。
+下面给出一个相对完整的定义结构供参考。
+```
+//不同的设备需要包含对应的operator实现.[详见](#impl)
+#ifdef NVIDIA_GPU
+#include "saber/funcs/impl/cuda/saber_mul.h"
+#include "saber/funcs/impl/cuda/vender_mul.h"
+#endif
+//如果一个设备现在还没有对应的operator实现，需要包含声明。[详见](#declare)
+#ifdef USE_X86_PLACE
+#include "saber/funcs/impl/impl_mul.h"
+#endif
+namespace anakin {
+namespace saber {
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_FLOAT,
+        DataType outDtype = AK_FLOAT,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW>
+class Mul : public BaseFunc<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        ImplBase, MulParam> {
+public:
+    using BaseFunc<
+            Tensor<TargetType, inDtype, LayOutType_in>,
+            Tensor<TargetType, outDtype, LayOutType_out>,
+            Tensor<TargetType, OpDtype, LayOutType_op>,
+            ImplBase, MulParam>::BaseFunc;
+    Mul() = default;
+    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef MulParam<OpTensor> Param_t;
+    typedef std::vector<InDataTensor *> Input_v;
+    typedef std::vector<OutDataTensor *> Output_v;
+    typedef std::vector<Shape> Shape_v;
+
+    virtual SaberStatus compute_output_shape(const Input_v &input,
+                                             Output_v &output, Param_t &param) override {
+        //计算输出的shape，
+        Shape output_shape = (input[0]->valid_shape());
+        /* code */
+        return output[0]->set_shape(output_shape);
+    }
+    virtual SaberStatus init_impl(ImplEnum implenum) override {
+      // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
+      switch (implenum) {
+            case VENDER_IMPL:
+                this->_impl.push_back(new VenderMul <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+            case SABER_IMPL:
+                this->_impl.push_back(new SaberMul <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+            default:
+                return SaberUnImplError;
+        }
+    }
+private:
+    virtual void pick_best_static() override {
+        if (true) // some condition?
+            this->_best_impl = this->_impl[0];
+    }
+    virtual void pick_best_specify(ImplEnum implenum) override {
+        this->_best_impl = this->_impl[0];
+    }
+};
+} // namespace saber
+} // namespace anakin
+```
+
+### 为operator增加新的impl<span id="declare">声明</span>
+
+涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明，特化版本放在对应的文件夹下，这里的声明就是给出所有设备的统一声明。下面给出一个参考。
+```
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+namespace saber{
+DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字，第二个是对应param的名字
+}
+}
+```
+
+### 完成新的operator特定后端<span id="impl">实现</span>
+
+涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
+这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op，```saber```指的源码实现的op。这里以cuda的vender实现为例，简单介绍一下特化出的函数的几个基本接口。
+
+```
+// include 对应的声明
+#include "saber/funcs/impl/impl_mul.h"
+
+namespace anakin{
+namespace saber{
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderMul<NV, //偏特化出需要的后端。
+    OpDtype, inDtype, outDtype,
+    LayOutType_op, LayOutType_in, LayOutType_out> :
+    public ImplBase<
+        Tensor<NV, inDtype, LayOutType_in>,
+        Tensor<NV, outDtype, LayOutType_out>,
+        Tensor<NV, OpDtype, LayOutType_op>,
+        MulParam<Tensor<NV, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<NV, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<NV, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<NV, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+    VenderMul(){}
+    ~VenderMul() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            MulParam<OpTensor>& param, Context<NV>& ctx) {
+        this->_ctx = ctx;
+        create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            MulParam<OpTensor>& param, Context<NV>& ctx) {
+        // set内部参数
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                        MulParam<OpTensor>& param) {
+        // dispatch kernel.
+    }
+
+private:
+};
+}
+}
+```
+```init```和```create```的区别：```init```接口是第一次初始化op的时候进入的接口，此函数只在第一次初始化op时调用，这个接口一般放一些只需要执行一次的代码，如malloc或者create之类的函数。```create```函数除了第一次init执行外，在输入发生变化或者param发生变化时会再次触发，create一般放置set函数，设置内部变量，当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内，如果```create```函数执行了一些严重耗时的操作，这里会拖慢整个op的执行时间，需要慎重选择操作放置的位置。
+### 添加framework的特化
+
+涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
+这里简单介绍下如果添加或修改framework内的operator
+
+```
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/mul.h" // 需要包对应的saber头文件
+namespace anakin {
+namespace ops {
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper;
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class Mul : public Operator<Ttype, Dtype, Ptype> {
+public:
+    Mul() {}
+    /// forward impl
+    virtual void operator() (OpContext<Ttype> &ctx,
+                             const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                             std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+        LOG(ERROR) << "Not Impl Yet Operator power<TargetType:"<<"unknown"<<","
+                   <<type_id<typename DataTypeWarpper<Dtype>::type>().type_info()<<">";
+    }
+    friend class MulHelper<Ttype, Dtype, Ptype>;
+};
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper : public OperatorHelper<Ttype, Dtype, Ptype> {
+public:
+    MulHelper() = default;
+    ~MulHelper();
+    Status InitParam() override;
+
+    Status Init(OpContext<Ttype> &ctx,
+                const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+    Status InferShape(const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                      std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+
+public:
+    saber::MulParam<Tensor4d<Ttype, Dtype>> _param_mul;
+    saber::Mul<Ttype, Dtype> _funcs_mul;
+};
+}
+} /* namespace anakin */
+```
+对应的```.cpp```文件如下：
+```
+#include "framework/operators/mul.h"
+
+namespace anakin {
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Mul<NV, AK_FLOAT, Precision::FP32>::operator()(
+    OpContext<NV>& ctx,
+    const std::vector<Tensor4dPtr<NV, AK_FLOAT> >& ins,
+    std::vector<Tensor4dPtr<NV, AK_FLOAT> >& outs) {
+    auto* impl =
+        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper);
+    auto& param =
+        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper)->_param_mul;
+    impl->_funcs_mul(ins, outs, param, ctx);
+}
+#endif
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InitParam() {
+    auto alpha = GET_PARAMETER(float, alpha);
+    MulParam<Tensor4d<Ttype, Dtype>> param_mul(alpha);
+    _param_mul = param_mul;
+    return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::Init(OpContext<Ttype>& ctx,
+        const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+
+    SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
+    return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InferShape(const
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+    SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
+    return Status::OK();
+}
+
+#ifdef USE_CUDA
+template class MulHelper<NV, AK_FLOAT, Precision::FP32>;
+#endif
+#ifdef USE_ARM_PLACE
+template class MulHelper<ARM, AK_FLOAT, Precision::FP32>;
+#endif
+// register helper
+#ifdef USE_CUDA
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
+#endif
+#ifdef USE_ARM_PLACE
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
+#endif
+//! register op
+ANAKIN_REGISTER_OP(Mul)
+.Doc("Mul operator")
+#ifdef USE_CUDA
+.__alias__<NV, AK_FLOAT, Precision::FP32>("mul")
+#endif
+#ifdef USE_ARM_PLACE
+.__alias__<ARM, AK_FLOAT, Precision::FP32>("mul")
+#endif
+.num_in(1)
+.num_out(1)
+.Args<float>("alpha", " alpha of Mul "); //注册
+
+} /* namespace ops */
+
+} /* namespace anakin */
+```
+
+## 实现单元测试
+涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
+在对应的test下需要添加新的单元测试
+
+```
+TEST(TestSaberFuncNV, test_depthwise_conv) {
+
+    // init tensors and some param.
+
+    // start Reshape & doInfer
+    Context<NV> ctx1(0, 1, 1);
+
+    // create param
+    MulParam<Tensor<NV, AK_FLOAT, NCHW> > param(alpha);
+
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> input;
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> output;
+
+    // create saber op
+    Mul<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> mul;
+
+    // compute output shape
+    mul.compute_output_shape(input, output, param);
+
+    // re_alloc output tensors memory based on output shape
+    output[0]->re_alloc(output[0]->shape());
+
+    // init saber op(calling init and create)
+    mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    // call operator()
+    mul(input, output, param, ctx1);
+
+    // cuda specified, record events
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+    output_dev.sync();
+    
+    // param changed 
+    param.alpha = 2.0;
+    // auto calling saber op(create and dispatch)
+    mul(input, output, param, ctx1);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv){
+    anakin::saber::Env<NV>::env_init();
+
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+```
+## 调试及注意事项
+
+一个op需要有对外的op接口和内部实现，由于存在saber/funcs/impl的非特化版本声明，当有op在某种设备下没有对应实现时，也能够编译，但此时是没有任何实现的空实现，
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f75f5e95cfb90f26d3782ba30a6d1887a70424
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
@@ -0,0 +1,459 @@
+# 如何支持一个新的设备
+
+## 概览
+
+添加一个新的设备需要以下3个步骤：
+
+* [在`CMakeList`中添加设备的支持](#0001)
+* [在`saber`中添加设备的实现](#0002)
+* [在`framework`中添加设备的具体化或实例化](#0003)
+
+假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
+
+## <span id = '0001'> 在`CMakeList`中添加设备的支持 </span> ##
+
+* 修改根目录`CMakeList.txt`
+```cmake
+#select the plantform to build
+anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
+anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
+anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." YES)
+```
+
+* 修改`saber/CMakeList.txt`
+
+根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+endif()
+```
+
+* 修改`test/CMakeList.txt`
+
+新增设备的单测文件放在`test/saber/tnew`目录下，修改`test`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+    anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+```
+
+* 修改`cmake/anakin_config.h.in`
+```c++
+// plantform to use
+#cmakedefine USE_GPU_PLACE
+
+#cmakedefine USE_X86_PLACE
+
+#cmakedefine USE_ARM_PLACE
+
+#cmakedefine USE_TNEW_PLACE
+```
+
+* 其他依赖和编译选项    
+修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
+
+
+## <span id = '0002'> 在`saber`中添加设备的实现 </span> ##
+`saber`是`Anakin`的基础计算库，对外提供设备无关的统一的API，设备相关的实现都会封装到`TargetWrapper`中。
+
+### 在`saber/saber_types.h`中添加设备
+
+```c++
+enum TargetTypeEnum {
+    eINVALID = -1,
+    eNV = 1,
+    eAMD = 2,
+    eARM = 3,
+    eX86 = 4,
+    eNVHX86 = 5,
+    eTNEW = 6
+};
+
+typedef TargetType<eNV> NV;
+typedef TargetType<eARM> ARM;
+typedef TargetType<eAMD> AMD;
+typedef TargetType<eX86> X86;
+typedef TargetType<eTNEW> TNEW;
+
+```
+
+### 在`saber/core`中添加设备的实现
+
+1. 在`target_traits.h`中添加新设备
+
+* 增加设备类型
+```c++
+struct __cuda_device{};
+struct __arm_device{};
+struct __amd_device{};
+struct __x86_device{};
+struct __tnew_device{};
+```
+
+* `TargetTypeTraits`模板具体化
+```c++
+template <>
+struct TargetTypeTraits<TNEW> {
+    typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
+    typedef __tnew_device target_type;
+};
+```
+
+2. 在`data_traits.h`中特化`DataTrait`模板类
+
+如果设备需要特殊的数据类型，则特化出设备的`DataTrait`类的实现，例如opencl数据类型的实现如下：
+```c++
+#ifdef USE_OPENCL
+struct ClMem{
+    ClMem(){
+        dmem = nullptr;
+        offset = 0;
+    }
+
+    ClMem(cl_mem* mem_in, int offset_in = 0) {
+        dmem = mem_in;
+        offset = offset_in;
+    }
+
+    ClMem(ClMem& right) {
+        dmem = right.dmem;
+        offset = right.offset;
+    }
+
+    ClMem& operator=(ClMem& right) {
+        this->dmem = right.dmem;
+        this->offset = right.offset;
+        return *this;
+    }
+
+    ClMem& operator+(int offset_in) {
+        this->offset += offset_in;
+        return *this;
+    }
+
+    int offset{0};
+    cl_mem* dmem;
+};
+
+template <>
+struct DataTrait<AMD, AK_FLOAT> {
+    typedef ClMem Dtype;
+    typedef float dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_DOUBLE> {
+    typedef ClMem Dtype;
+    typedef double dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_INT8> {
+    typedef ClMem Dtype;
+    typedef char dtype;
+};
+#endif //use_opencl
+```
+
+3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
+
+特化`TargetWrapper`模板类，在`target_wrapper.h`中声明函数，具体如下：
+```c++
+template <>
+struct TargetWrapper<TNEW, __xxx_target> { //根据TNEW的具体类型修改__xxx_target，__host_target或者__device_target
+
+    typedef xxx_event event_t;          //根据设备实现xxx_event
+    typedef xxx_stream stream_t;        //根据设备实现xxx_stream
+
+    static void get_device_count(int& count);
+
+    static void set_device(int id);
+
+    //We should add strategy to avoid malloc directly
+    static void mem_alloc(void** ptr, size_t n);
+
+    static void mem_free(void* ptr);
+
+    static void mem_set(void* ptr, int value, size_t n);
+
+    static void create_event(event_t& event, bool flag = false);
+
+    static void create_stream(stream_t& stream);
+
+    static void create_stream_with_flag(stream_t& stream, unsigned int flag);
+
+    static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
+
+    static void destroy_stream(stream_t& stream);
+
+    static void destroy_event(event_t& event);
+
+    static void record_event(event_t& event, stream_t stream);
+
+    static void query_event(event_t& event);
+
+    static void sync_event(event_t& event);
+
+    static void sync_stream(event_t& event, stream_t& stream);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __DtoD);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __DtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __HtoD);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __HtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __DtoH);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __DtoH);
+
+    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+                                int src_dev, size_t count);
+
+    static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+                                 int src_dev, size_t count, stream_t& stream);
+
+    static int get_device_id();
+};
+
+```
+
+4. 在`impl/`目录下添加设备目录和实现
+
+在`saber/core/impl`目录下添加设备目录`tnew`。
+* 实现`TargetWrapper<TNEW, __xxx_target>`结构体中各函数的定义。    
+如果`TargetWrapper<TNEW, __xxx_target>`的实现与默认的模板类一致，则不用特化出该类。
+
+```c++
+typedef TargetWrapper<TNEW, __xxx_target> TNEW_API;
+void TNEW_API::get_device_count(int &count) {
+    // add implementation
+}
+
+void TNEW_API::set_device(int id){
+    // add implementation
+}
+        
+void TNEW_API::mem_alloc(void** ptr, size_t n){
+    // add implementation
+}
+        
+void TNEW_API::mem_free(void* ptr){
+    if(ptr != nullptr){
+        // add implementation
+    }
+}
+...
+
+```
+
+* 特化实现`device.h`中的`Device<TNEW>`
+
+```c++
+template <>
+void Device<TNEW>::create_stream() {
+    // add implementation
+}
+
+template <>
+void Device<TNEW>::get_info() {
+
+    // add implementation
+}
+
+```
+
+### 在`saber/funcs`中实现设备相关的op
+
+参考[如何增加新的Operator](addCustomOp.md)
+
+
+## <span id = '0003'> 在`framework`中添加设备的具体化或实例化 </span> ##
+
+### `framework/core`
+
+* `net.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_func.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
+#endif
+```
+
+* `worker.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_attr.cpp`中添加实例化
+
+```c++
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
+```
+
+* `parameter.h`中添加设备的实现
+
+```c++
+#ifdef USE_TNEW_PLACE
+template<typename Dtype>
+class PBlock<Dtype, TNEW> {
+public:
+	typedef Tensor4d<TNEW, DataTypeRecover<Dtype>::type> type;
+
+	PBlock() {
+		_inner_tensor = std::make_shared<type>(); 
+	}
+	...
+}
+#endif //TNEW
+```
+
+* `type_traits_extend.h`中添加设备的实现
+
+```c++
+template<>
+struct target_host<saber::TNEW> {
+    typedef saber::X86 type; //根据TNEW选择正确的host type
+};
+```
+
+### `framework/graph`
+
+* `graph.cpp`中添加实例化
+  
+```c++
+  #ifdef USE_TNEW_PLACE
+  template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
+  template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
+  template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
+  #endif
+```
+
+### `framework/model_parser`
+
+* `parser.cpp`中添加实例化
+  
+```c++
+  #ifdef USE_TNEW_PLACE
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          const char* model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          const char* model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          const char* model_path);
+  
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          std::string& model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          std::string& model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          std::string& model_path);
+  
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          std::string& model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          std::string& model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          std::string& model_path);
+  
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          const char* model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          const char* model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          const char* model_path);
+  #endif
+```
+
+* `model_io.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
+#endif
+```
+
+### `framework/operators`
+
+为`framework/operators`目录下所有op添加实例化或具体化
+以`activation.cpp`为例，实例化如下：
+
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+如果TNEW设备函数的实现与现有模板实现不一致，可以特化实现如下（以init()为例）：
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template <>
+Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW> &ctx,\
+        const std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& ins, \
+                std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& outs) {
+    SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式
+    return Status::OK();
+}
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册
+
+```c++
+#ifdef USE_TNEW_PLACE
+.__alias__<TNEW, AK_FLOAT, Precision::FP32>("activation")
+#endif
+```
+
+## 注意事项
+不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b782242a6632a5d42a512cf3b830d6e047c064ab
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
@@ -0,0 +1,26 @@
+服务器端部署 - Anakin
+#####################
+
+
+使用文档
+~~~~~~~
+
+.. toctree::
+   :maxdepth: 1
+
+   install_anakin.md
+   convert_paddle_to_anakin.md
+   run_anakin_on_arm.md
+   anakin_tutorial.md
+   anakin_example.md
+   anakin_gpu_benchmark.md
+   anakin_arm_benchmark.md
+
+开发文档
+~~~~~~~
+
+.. toctree::
+   :maxdepth: 1
+
+   how_to_add_anakin_op.md
+   how_to_support_new_device_in_anakin.md
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst
new file mode 100644
index 0000000000000000000000000000000000000000..47df6392c123d520c701089db6ee1ae72e4f8ea5
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst
@@ -0,0 +1,9 @@
+移动端部署
+##########
+
+.. toctree::
+   :maxdepth: 2
+
+   mobile_build.md
+   mobile_dev.md
+
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a5209e8560b31e9f0f776fba9a2b8c5bc150165c
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
@@ -0,0 +1,8 @@
+服务器端部署 - 原生引擎
+#######################
+
+..  toctree::
+    :maxdepth: 2
+
+    build_and_install_lib_cn.rst
+    native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb7c1950308622e3de292268a718e6ec688e6ae6
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md
@@ -0,0 +1,69 @@
+## 从源码编译安装Anakin ##
+
+我们已经在CentOS 7.3上成功的安装和测试了Anakin，对于其他操作系统，我们将很快支持。
+
+### 安装概览 ###
+
+* [在CentOS上安装 Anakin]()
+* [在Ubuntu上安装 Anakin]()
+* [在ARM上安装 Anakin](run_on_arm_ch.md)
+* [验证安装]()
+
+
+### 在CentOS上安装 Anakin ###
+#### 1. 系统要求 ####
+
+*  make 3.82+
+*  cmake 2.8.12+
+*  gcc 4.8.2+
+*  g++ 4.8.2+
+*  其他需要补充的。。。
+
+#### 2. 编译CPU版Anakin ####
+
+暂时不支持
+
+#### 3. 编译支持NVIDIA GPU的Anakin ####
+
+- 3.1. 安装依赖
+  - 3.1.1 protobuf  
+    >$ git clone https://github.com/google/protobuf  
+    >$ cd protobuf  
+    >$ git submodule update --init --recursive  
+    >$ ./autogen.sh  
+    >$ ./configure --prefix=/path/to/your/insall_dir  
+    >$ make  
+    >$ make check  
+    >$ make install  
+    >$ sudo ldconfig
+
+
+    如安装protobuf遇到任何问题，请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md)
+
+- 3.2 CUDA Toolkit
+  - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+  - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). 
+- 3.3  编译Anakin
+  >$ git clone https:/xxxxx  
+  >$ cd anakin  
+  >$ mkdir build  
+  >$ camke ..  
+  >$ make
+
+
+#### 4. 编译支持AMD GPU的Anakin ####
+
+暂时还不支持
+
+
+### 在Ubuntu上安装 Anakin ###
+
+暂时还不支持
+
+
+### 在ARM上安装 Anakin ###
+
+暂时还不支持
+
+### 验证安装 ###
+we are coming soon...
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md
new file mode 100644
index 0000000000000000000000000000000000000000..e51593164987d548e256ddebbc5fa8d960fb5255
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md
@@ -0,0 +1,59 @@
+# 环境搭建
+## 使用 docker
+### 1. 安装 docker
+安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下，执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
+paddle-mobile   dev     33b146787711   45 hours ago    372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录，执行 docker run
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+修改选项后，按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构架产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
+
+## 不使用 docker
+不使用 docker 的方法，可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具，可能需要设置 CC，CXX 环境变量，或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake，或者增加自己需要的 toolchain file。
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md
new file mode 100644
index 0000000000000000000000000000000000000000..474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md
@@ -0,0 +1,72 @@
+# iOS开发文档
+
+## 编译
+
+### 一. 使用 build.sh 编译
+
+```sh 
+sh build.sh ios
+
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+
+```
+
+### 二. 使用 xcode 编译
+
+我们提供了 ios 开发更为熟悉的 xcode 编译环境:
+在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo
+
+### 三. 集成
+
+#### 如使用 c++ 接口
+将 
+
+```
+libpaddle-mobile.a 
+io.h  
+program.h 
+types.h 
+lod_tensor.h 
+tensor.h
+```
+拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
+
+#### 如使用 oc 接口
+将在xcode 编译生成的
+```
+libPaddleMobile.a 
+PaddleMobile.h
+```
+拖入工程, 接口如下:
+
+```
+/*
+	创建单例对象
+*/
++ (instancetype)sharedInstance;
+
+/*
+	load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
+
+/*
+	进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image;
+
+/*
+	清理内存
+*/
+- (void)clear;
+
+```
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e1eee3f818796e895362caab10846cf59b557162
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -0,0 +1,108 @@
+Paddle 预测 API
+===============
+
+为了更简单方便的预测部署，Fluid 提供了一套高层 API
+用来隐藏底层不同的优化实现。
+
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
+包括
+
+-  头文件 ``paddle_inference_api.h`` 定义了所有的接口
+-  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
+-  库文件 ``libpaddle_inference_api.so`` 或
+   ``libpaddle_inference_api.a``
+
+编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
+
+下面是一些 API 概念的介绍
+
+PaddleTensor
+------------
+
+PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
+
+.. code:: cpp
+
+    struct PaddleTensor {
+      std::string name;  // variable name.
+      std::vector<int> shape;
+      PaddleBuf data;  // blob of data.
+      PaddleDType dtype;
+    };
+
+-  ``name`` 用于指定输入数据对应的 模型中variable 的名字
+   （暂时没有用，但会在后续支持任意 target 时启用）
+-  ``shape`` 表示一个 Tensor 的 shape
+-  ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf``
+   中，\ ``PaddleBuf``
+   可以接收外面的数据或者独立\ ``malloc``\ 内存，详细可以参考头文件中相关定义。
+-  ``dtype`` 表示 Tensor 的数据类型
+
+engine
+------
+
+高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
+
+-  原生 engine，由 paddle 原生的 forward operator
+   组成，可以天然支持所有paddle 训练出的模型，
+-  Anakin engine，封装了
+   `Anakin <https://github.com/PaddlePaddle/Anakin>`__
+   ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle
+   模型，
+-  TensorRT mixed engine，用子图的方式支持了
+   `TensorRT <https://developer.nvidia.com/tensorrt>`__ ，支持所有paddle
+   模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
+
+其实现为
+
+.. code:: cpp
+
+    enum class PaddleEngineKind {
+      kNative = 0,       // Use the native Fluid facility.
+      kAnakin,           // Use Anakin for inference.
+      kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
+    };
+
+预测部署过程
+------------
+
+总体上分为以下步骤
+
+1. 用合适的配置创建 ``PaddlePredictor``
+2. 创建输入用的 ``PaddleTensor``\ ，传入到 ``PaddlePredictor`` 中
+3. 获取输出的 ``PaddleTensor`` ，将结果取出
+
+下面完整演示一个简单的模型，部分细节代码隐去
+
+.. code:: cpp
+
+    #include "paddle_inference_api.h"
+
+    // 创建一个 config，并修改相关设置
+    paddle::NativeConfig config;
+    config.model_dir = "xxx";
+    config.use_gpu = false;
+    // 创建一个原生的 PaddlePredictor
+    auto predictor =
+          paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    // 创建输入 tensor
+    int64_t data[4] = {1, 2, 3, 4};
+    paddle::PaddleTensor tensor{.name = "",
+                                .shape = std::vector<int>({4, 1}),
+                                .data = PaddleBuf(data, sizeof(data)),
+                                .dtype = PaddleDType::INT64};
+    // 创建输出 tensor，输出 tensor 的内存可以复用
+    std::vector<paddle::PaddleTensor> outputs;
+    // 执行预测
+    CHECK(predictor->Run(slots, &outputs));
+    // 获取 outputs ...
+
+编译时，联编 ``libpaddle_fluid.a/.so`` 和
+``libpaddle_inference_api.a/.so`` 便可。
+
+详细代码参考
+------------
+
+-  `inference
+   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
+-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md
new file mode 100644
index 0000000000000000000000000000000000000000..ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md
@@ -0,0 +1,151 @@
+## 源码编译 Anakin ##
+
+目前Anakin支持ARM Android平台，采用Android NDK交叉编译工具链，已在mac os和centos上编译和测试通过。
+
+### 安装概览 ###
+
+* [系统需求](#0001)
+* [安装第三方依赖](#0002)
+* [Anakin源码编译](#0003)
+* [验证安装](#0004)
+
+
+### <span id = '0001'> 1. 系统需求 </span> ###
+
+*  宿主机: linux, mac    
+*  cmake 3.8.2+    
+*  Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
+
+### <span id = '0002'> 2. 安装第三方依赖 </span> ###
+
+- 2.1 protobuf3.4.0     
+   源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0)    
+ - 2.1.1 为宿主机编译protobuf     
+ ```bash
+   $ tar -xzf protobuf-3.4.0.tar.gz  
+   $ cd protobuf-3.4.0   
+   $ ./autogen.sh  
+   $ ./configure    
+   $ make  
+   $ make check   
+   $ make install
+   ```
+   上述 $make install 执行后，可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下，
+   如有问题，请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。
+   然后将已经生成文件清除。
+ ```bash
+   $ make distclean
+   ```
+ - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf，注意设置ANDROID_NDK的路径，以及ARCH_ABI、HOSTOSN的值，   
+ ```bash
+
+   $ export ANDROID_NDK=your_ndk_path 
+   $ ARCH_ABI="arm-linux-androideabi-4.9"
+   $ HOSTOSN="darwin-x86_64"
+   $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm  
+   $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
+   $ export LDFLAGS="--sysroot=$SYSROOT"
+   $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
+   $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
+   $ export CPPFLAGS=""
+   $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
+   $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
+   $ export CCFLAGS="$CXXFLAGS"
+   $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
+   $ export CC="$CXX"
+   $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"  
+   $ ./autogen.sh  
+   $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"  
+   $ make
+  ```
+  
+  编译生成 *.a 静态库，若希望编译*.so 动态链接库 ，请在./configure参数中改--disable-shared为--disable-static --enable-shared。  
+  生成文件在src/.libs/下，将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。  
+  在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。        
+  ```cmake
+  set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+  ```
+  
+- 2.2 opencv 2.4.3+(optional)    
+    Anakin只在examples示例中使用opencv   
+    Android系统的opencv从[这里下载](https://opencv.org/releases.html)    
+    解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a`    
+    在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, 
+    并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。   
+    ```cmake
+    include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+    LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+    ```
+### <span id = '0003'> 3. Anakin源码编译 </span> ###
+
+#### 编译Android版本
+
+   克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm)
+```bash
+    cd your_dir
+    git clone https://github.com/PaddlePaddle/Anakin.git
+    cd Anakin
+    git fetch origin arm
+    git checkout arm
+  ```
+  修改`android_build.sh`    
+- 修改NDK路径    
+  ```bash
+    #modify "your_ndk_path" to your NDK path
+    export ANDROID_NDK=your_ndk_path
+  ```
+- 修改ARM 处理器架构     
+  对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`， 
+  对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。        
+  目前我们只支持 `armeabi-v7a with NEON`；`arm64-v8a` 还在开发中。      
+  ```bash
+      -DANDROID_ABI="armeabi-v7a with NEON"
+  ```
+- 设置Android API    
+  根据Android系统的版本设置API level， 例如API Level 21 -> Android 5.0.1    
+  ```bash
+      -DANDROID_NATIVE_API_LEVEL=21
+  ```
+
+- 选择编译静态库或动态库    
+  设置`BUILD_SHARED=NO`编译静态库    
+  设置`BUILD_SHARED=YES`编译动态库    
+  ```bash
+      -DBUILD_SHARED=NO
+  ```
+- OpenMP多线程支持    
+  设置`USE_OPENMP=YES`开启OpenMP多线程    
+  ```bash
+      -DUSE_OPENMP=YES
+  ```
+  
+- 编译单测文件    
+  设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件    
+    ```bash
+        -DBUILD_WITH_UNIT_TEST=YES
+    ```
+
+- 编译示例文件    
+  设置`BUILD_EXAMPLES=YES`将会编译示例文件    
+    ```bash
+        -DBUILD_EXAMPLES=YES
+    ```
+  
+- 开启opencv    
+  如果使用opencv，设置`USE_OPENCV=YES`    
+    ```bash
+        -DUSE_OPENCV=YES
+    ```
+    
+- 开始编译    
+  运行脚本 `android_build.sh` 将自动编译Anakin     
+  ```bash
+      ./android_build.sh
+  ```
+
+### <span id = '0004'> 4. 验证安装 </span> ###    
+  编译好的库会放在目录`${Anakin_root}/output`下；    
+  编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下；    
+  编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。
+  
+  对于Android系统，打开设备的调试模式，通过ADB可以访问的目录是`data/local/tmp`，通过ADB push将测试文件、模型和数据发送到设备目录， 运行测试文件。
diff --git a/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md
new file mode 120000
index 0000000000000000000000000000000000000000..1126df7a829ab6d98e58a44e8f9c6459feae9a8b
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md
@@ -0,0 +1 @@
+../../../dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..1381a3b05f6761c60742eb9365708d94ad8a2642
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/cpu_profiling_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+============
+GPU性能调优
+============
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, processor has two key performance limits include float point throughput and
+memory throughput. For GPU,  it also need more parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..904968ba4a8d6cc6489c91a0a751e0a33dcc873c
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/host_memory_profiling_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/new_op.md b/doc/fluid/new_docs/advanced_usage/development/new_op.md
new file mode 120000
index 0000000000000000000000000000000000000000..dce0348585b8c484c1418a03a5fde5d78b0afcc9
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/new_op.md
@@ -0,0 +1 @@
+../../../dev/new_op_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp1.png b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp2.png b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp3.png b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp4.png b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_1.png b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_2.png b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..a05540e82a7fa795dcd8e7306261ef9bef57426f
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/timeline_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg differ
diff --git a/doc/fluid/new_docs/advanced_usage/development/write_docs.rst b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst
new file mode 120000
index 0000000000000000000000000000000000000000..dc536c8bdd4924758d4418bac8e4181ffbb1f780
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst
@@ -0,0 +1 @@
+../../../dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dea7c236619a0bdbf402f371571d947d1cdbba65
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
@@ -0,0 +1,23 @@
+########
+进阶使用
+########
+
+
+..  todo::
+
+    Complete this guide
+
+..  toctree::
+    :maxdepth: 2
+
+    deploy/index_native.rst
+    deploy/index_anakin.rst
+    deploy/index_mobile.rst
+    development/contribute_to_paddle.md
+    development/write_docs.rst
+    development/new_op.md
+    development/cpu_profiling_cn.md
+    development/gpu_profiling_cn.rst
+    development/host_memory_profiling_cn.md
+    development/timeline_cn.md
+    benchmark.rst
diff --git a/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png
new file mode 100644
index 0000000000000000000000000000000000000000..52d4992a22397119af949aa7c11a9ea6365c167c
Binary files /dev/null and b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+train.log
+output
+data/cifar-10-batches-py/
+data/cifar-10-python.tar.gz
+data/*.txt
+data/*.list
+data/mean.meta
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca8f858a902ea723d886d2b88c2c0a1005301c50
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
new file mode 100644
index 0000000000000000000000000000000000000000..38b21f21604b1bb84fc3f6aa96bd5fce45d15a55
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
new file mode 100644
index 0000000000000000000000000000000000000000..647c822e52cd55d50e5f207978f5e6ada86cf34c
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
new file mode 100644
index 0000000000000000000000000000000000000000..04245cef60fe7126ae4c92ba8085273965078bee
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..249dbf96df61c3352ea5bd80470f6c4a1e03ff10
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
new file mode 100644
index 0000000000000000000000000000000000000000..4660ac122e9d533023a21154d35eee29e3b08d27
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
new file mode 100644
index 0000000000000000000000000000000000000000..9591a0c1e8c0165c40ca560be35a7b9a91cd5027
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
new file mode 100644
index 0000000000000000000000000000000000000000..77f785e03bacd38c4c64a817874a58ff3298d2f3
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..57e45cc0c27dd99b9918de2ff1228bc6b65f7424
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
new file mode 100644
index 0000000000000000000000000000000000000000..0aeb4f254639fdbf18e916dc219ca61602596d85
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c500eb01a90190ff66150871fe83ec275e2de8d7
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6336a9a69b95dc978719ce68896e3e752e67fed
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
new file mode 100644
index 0000000000000000000000000000000000000000..6270eefcfd7071bc1643ee06567e5b81aaf4c177
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce0d2bb1dc0cf73151ee9aceea7e4d7b24af1926
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md
@@ -0,0 +1,559 @@
+
+# 图像分类
+
+本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+图像相比文字能够提供更加生动、容易理解及更具艺术感的信息，是人们转递与交换信息的重要来源。在本教程中，我们专注于图像识别领域的一个重要问题，即图像分类。
+
+图像分类是根据图像的语义信息将不同类别图像区分开来，是计算机视觉中重要的基本问题，也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用，包括安防领域的人脸识别和智能视频分析等，交通领域的交通场景识别，互联网领域基于内容的图像检索和相册自动归类，医学领域的图像识别等。
+
+
+一般来说，图像分类通过手工特征或特征学习方法对整个图像进行全部描述，然后使用分类器判别物体类别，因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入，即一句话可以用一个装了词的袋子表示其特征，袋子中的词为句子中的单词、短语或字。对于图像而言，词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。
+
+而基于深度学习的图像分类方法，可以通过有监督或无监督的方式**学习**层次化的特征描述，从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩，CNN直接利用图像像素信息作为输入，最大程度上保留了输入图像的所有信息，通过卷积操作进行特征的提取和高层抽象，模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果，得到了广泛的应用。
+
+本教程主要介绍图像分类的深度学习模型，以及如何使用PaddlePaddle训练CNN模型。
+
+## 效果展示
+
+图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果，即模型可以正确识别图像上的主要物体。
+
+![dogCatClassification](./image/dog_cat.png)
+<p align="center">
+图1. 通用图像分类展示
+</p>
+
+
+图2展示了细粒度图像分类-花卉识别的效果，要求模型可以正确识别花的类别。
+
+![flowersClassification](./image/flowers.png)
+<p align="center">
+图2. 细粒度图像分类展示
+</p>
+
+
+一个好的模型既要对不同类别识别正确，同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动，较好的模型会像聪明的人类一样能够正确识别。
+
+![imageVariations](https://raw.githubusercontent.com/PaddlePaddle/book/develop/03.image_classification/image/variations.png)
+<p align="center">
+图3. 扰动图片展示[22]
+</p>
+
+## 模型概览
+
+图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上，很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛，ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集，在本章中我们基于这些竞赛的一些论文介绍图像分类模型。
+
+在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成，但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。
+1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等，一般也采用多种特征描述子，防止丢失过多的有用信息。
+2). **特征编码**: 底层特征中包含了大量冗余与噪声，为了提高特征表达的鲁棒性，需要使用一种特征变换算法对底层特征进行编码，称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。
+3). **空间特征约束**: 特征编码之后一般会经过空间特征约束，也称作**特征汇聚**。特征汇聚是指在一个空间范围内，对每一维特征取最大值或者平均值，可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法，这种方法提出将图像均匀分块，在分块内做特征汇聚。
+4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述，接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器，在传统图像分类任务上性能很好。
+
+这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征，两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。
+
+Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破，效果大幅度超越传统方法，获得了ILSVRC2012冠军，该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后，涌现了一系列CNN模型，不断地在ImageNet上刷新成绩，如图4展示。随着模型变得越来越深以及精妙的结构设计，Top-5的错误率也越来越低，降到了3.5%附近。而在同样的ImageNet数据集上，人眼的辨识错误率大概在5.1%，也就是目前的深度学习模型的识别能力已经超过了人眼。
+
+![ilsvrc](./image/ilsvrc.png)
+<p align="center">
+图4. ILSVRC图像分类Top-5错误率
+</p>
+
+### CNN
+
+传统CNN包含卷积层、全连接层等组件，并采用softmax多类别分类器和多类交叉熵损失函数，一个典型的卷积神经网络如图5所示，我们先介绍用来构造CNN的常见组件。
+
+![cnnStructure](./image/lenet.png)
+<p align="center">
+图5. CNN网络示例[20]
+</p>
+
+- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征，发掘出图片局部关联性质和空间不变性质。
+- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作，可以过滤掉一些不重要的高频信息。
+- 全连接层(fully-connected layer，或者fc layer): 输入层到隐藏层的神经元是全部连接的。
+- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层，例如Sigmoid、Tanh、ReLu等来增强网络的表达能力，在CNN里最常使用的为ReLu激活函数。
+- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作，提高网络的泛化能力，一定程度上防止过拟合。
+
+另外，在训练过程中由于每层参数不断更新，会导致下一次输入分布发生变化，这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中，每个batch对网络中的每一层特征都做归一化，使得每层分布相对稳定。BN算法不仅起到一定的正则作用，而且弱化了一些超参数的设计。经过实验证明，BN算法加速了模型收敛过程，在后来较深的模型中被广泛使用。
+
+接下来我们主要介绍VGG，GoogleNet和ResNet网络结构。
+
+### VGG
+
+牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构，它的核心是五组卷积操作，每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积，卷积核的数目由较浅组的64增多到最深组的512，同一组内的卷积核数目是一样的。卷积之后接两层全连接层，之后是分类层。由于每组内卷积层的不同，有11、13、16、19层这几种模型，下图展示一个16层的网络结构。VGG模型结构相对简洁，提出之后也有很多文章基于此模型进行研究，如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
+
+![vgg16](./image/vgg16.png)
+<p align="center">
+图6. 基于ImageNet的VGG16模型
+</p>
+
+### GoogleNet
+
+GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC的获得了冠军，在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块，因为GoogleNet模型由多组Inception模块组成，模型设计借鉴了NIN的一些思想。
+
+NIN模型主要有两个特点：1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络，即在线性卷积后面增加若干层1x1的卷积，这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层，参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图，然后采用全局均值池化(Avg-Pooling)替代全连接层，得到类别维度大小的向量，再进行分类。这种替代全连接层的方式有利于减少参数。
+
+Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积后，通道数会越来越大，导致参数和计算量也随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的1x1卷积也可以修正线性特征。
+
+![inception](./image/inception.png)
+<p align="center">
+图7. Inception模块
+</p>
+
+GoogleNet由多组Inception模块堆积而成。另外，在网络最后也没有采用传统的多层全连接层，而是像NIN网络一样采用了均值池化层；但与NIN不同的是，池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外，由于网络中间层特征也很有判别性，GoogleNet在中间层添加了两个辅助分类器，在后向传播中增强梯度并且增强正则化，而整个网络的损失函数是这个三个分类器的损失加权求和。
+
+GoogleNet整体网络结构如图8所示，总共22层网络：开始由3层普通的卷积组成；接下来由三组子网络组成，第一组子网络包含2个Inception模块，第二组包含5个Inception模块，第三组包含2个Inception模块；然后接均值池化层、全连接层。
+
+![googleNet](./image/googlenet.jpeg)
+<p align="center">
+图8. GoogleNet[12]
+</p>
+
+
+上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层；GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解，进一步提高网络非线性能力和加深网络；GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升，介于篇幅，这里不再详细介绍v2到v4的结构。
+
+
+### ResNet
+
+ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题，ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核，全卷积网络)的基础上，引入了残差模块。每个残差模块包含两条路径，其中一条路径是输入特征的直连通路，另一条路径对该特征做两到三次卷积操作得到该特征的残差，最后再将两条路径上的特征相加。
+
+残差模块如图9所示，左边是基本模块连接方式，由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式，之所以称为瓶颈，是因为上面的1x1卷积用来降维(图示例即256->64)，下面的1x1卷积用来升维(图示例即64->256)，这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
+
+![ResNetBlock](./image/resnet_block.jpg)
+<p align="center">
+图9. 残差模块
+</p>
+
+图10展示了50、101、152层网络连接示意图，使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快，成功的训练了上百乃至近千层的卷积神经网络。
+
+![ResNet](./image/resnet.png)
+<p align="center">
+图10. 基于ImageNet的ResNet模型
+</p>
+
+
+## 数据准备
+
+通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等，常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大，如[模型概览](#模型概览)一章所讲，大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化，常用的是ImageNet-2012数据集，该数据集包含1000个类别：训练集包含1,281,167张图片，每个类别数据732至1300张不等，验证集包含50,000张图片，平均每个类别50张图片。
+
+由于ImageNet数据集较大，下载和训练较慢，为了方便大家学习，我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片，10个类别，每个类包含6,000张。其中50,000张图片作为训练集，10000张作为测试集。图11从每个类别中随机抽取了10张图片，展示了所有的类别。
+
+![CIFAR](https://raw.githubusercontent.com/PaddlePaddle/book/develop/03.image_classification/image/cifar.png)
+<p align="center">
+图11. CIFAR10数据集[21]
+</p>
+
+Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。
+
+通过输入`python train.py`，就可以开始训练模型了，以下小节将详细介绍`train.py`的相关内容。
+
+### 模型结构
+
+#### Paddle 初始化
+
+让我们从导入 Paddle Fluid API 和辅助模块开始。
+
+```python
+import paddle
+import paddle.fluid as fluid
+import numpy
+import sys
+```
+
+本教程中我们提供了VGG和ResNet两个模型的配置。
+
+#### VGG
+
+首先介绍VGG模型结构，由于CIFAR10图片大小和数量相比ImageNet数据小很多，因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。
+VGG核心模块的输入是数据层，`vgg_bn_drop` 定义了16层VGG结构，每层卷积后面引入BN层和Dropout层，详细的定义如下：
+
+```python
+def vgg_bn_drop(input):
+def conv_block(ipt, num_filter, groups, dropouts):
+return fluid.nets.img_conv_group(
+input=ipt,
+pool_size=2,
+pool_stride=2,
+conv_num_filter=[num_filter] * groups,
+conv_filter_size=3,
+conv_act='relu',
+conv_with_batchnorm=True,
+conv_batchnorm_drop_rate=dropouts,
+pool_type='max')
+
+conv1 = conv_block(input, 64, 2, [0.3, 0])
+conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+bn = fluid.layers.batch_norm(input=fc1, act='relu')
+drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
+return predict
+```
+
+1. 首先定义了一组卷积网络，即conv_block。卷积核大小为3x3，池化窗口大小为2x2，窗口滑动大小为2，groups决定每组VGG模块是几次连续的卷积操作，dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块，由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。
+
+2. 五组卷积操作，即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0，即不使用Dropout操作。
+
+3. 最后接两层512维的全连接。
+
+4. 通过上面VGG网络提取高层特征，然后经过全连接层映射到类别维度大小的向量，再通过Softmax归一化得到每个类别的概率，也可称作分类器。
+
+### ResNet
+
+ResNet模型的第1、3、4步和VGG模型相同，这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。
+
+先介绍`resnet_cifar10`中的一些基本函数，再介绍网络连接过程。
+
+- `conv_bn_layer` : 带BN的卷积层。
+- `shortcut` : 残差模块的"直连"路径，"直连"实际分两种形式：残差模块输入和输出特征通道数不等时，采用1x1卷积的升维操作；残差模块输入和输出通道相等时，采用直连操作。
+- `basicblock` : 一个基础残差模块，即图9左边所示，由两组3x3卷积组成的路径和一条"直连"路径组成。
+- `bottleneck` : 一个瓶颈残差模块，即图9右边所示，由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。
+- `layer_warp` : 一组残差模块，由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同，以用来减少特征图在垂直和水平方向的大小。
+
+```python
+def conv_bn_layer(input,
+ch_out,
+filter_size,
+stride,
+padding,
+act='relu',
+bias_attr=False):
+tmp = fluid.layers.conv2d(
+input=input,
+filter_size=filter_size,
+num_filters=ch_out,
+stride=stride,
+padding=padding,
+act=None,
+bias_attr=bias_attr)
+return fluid.layers.batch_norm(input=tmp, act=act)
+
+
+def shortcut(input, ch_in, ch_out, stride):
+if ch_in != ch_out:
+return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+else:
+return input
+
+
+def basicblock(input, ch_in, ch_out, stride):
+tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
+short = shortcut(input, ch_in, ch_out, stride)
+return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+
+def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+tmp = block_func(input, ch_in, ch_out, stride)
+for i in range(1, count):
+tmp = block_func(tmp, ch_out, ch_out, 1)
+return tmp
+```
+
+`resnet_cifar10` 的连接结构主要有以下几个过程。
+
+1. 底层输入连接一层 `conv_bn_layer`，即带BN的卷积层。
+2. 然后连接3组残差模块即下面配置3组 `layer_warp` ，每组采用图 10 左边残差模块组成。
+3. 最后对网络做均值池化并返回该层。
+
+注意：除过第一层卷积层和最后一层全连接层之外，要求三组 `layer_warp` 总的含参层数能够被6整除，即 `resnet_cifar10` 的 depth 要满足 `$(depth - 2) % 6 == 0$` 。
+
+```python
+def resnet_cifar10(ipt, depth=32):
+# depth should be one of 20, 32, 44, 56, 110, 1202
+assert (depth - 2) % 6 == 0
+n = (depth - 2) / 6
+nStages = {16, 64, 128}
+conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1)
+res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+pool = fluid.layers.pool2d(
+input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+predict = fluid.layers.fc(input=pool, size=10, act='softmax')
+return predict
+```
+
+## Infererence Program 配置
+
+网络输入定义为 `data_layer` (数据层)，在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图，因此输入数据大小为3072(3x32x32)。
+
+```python
+def inference_program():
+# The image is 32 * 32 with RGB representation.
+data_shape = [3, 32, 32]
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+
+predict = resnet_cifar10(images, 32)
+# predict = vgg_bn_drop(images) # un-comment to use vgg net
+return predict
+```
+
+## Train Program 配置
+
+然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。
+在训练期间，它将从预测中计算 `avg_cost`。
+在有监督训练中需要输入图像对应的类别信息，同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数，并作为网络的输出，预测阶段定义网络的输出为分类器得到的概率信息。
+
+**注意:** 训练程序应该返回一个数组，第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。
+
+```python
+def train_program():
+predict = inference_program()
+
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(cost)
+accuracy = fluid.layers.accuracy(input=predict, label=label)
+return [avg_cost, accuracy]
+```
+
+## Optimizer Function 配置
+
+在下面的 `Adam optimizer`，`learning_rate` 是训练的速度，与网络的训练收敛速度有关系。
+
+```python
+def optimizer_program():
+return fluid.optimizer.Adam(learning_rate=0.001)
+```
+
+## 训练模型
+
+### Trainer 配置
+
+现在，我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer_func`。
+
+```python
+use_cuda = False
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+trainer = fluid.Trainer(
+train_func=train_program,
+optimizer_func=optimizer_program,
+place=place)
+```
+
+### Data Feeders 配置
+
+`cifar.train10()` 每次产生一条样本，在完成shuffle和batch之后，作为训练的输入。
+
+```python
+# Each batch will yield 128 images
+BATCH_SIZE = 128
+
+# Reader for training
+train_reader = paddle.batch(
+paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000),
+batch_size=BATCH_SIZE)
+
+# Reader for testing. A separated data set for testing.
+test_reader = paddle.batch(
+paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+```
+
+### Event Handler
+
+可以使用`event_handler`回调函数来观察训练过程，或进行测试等, 该回调函数是`trainer.train`函数里设定。
+
+`event_handler_plot`可以用来利用回调数据来打点画图:
+
+![png](./image/train_and_test.png)
+
+```python
+params_dirname = "image_classification_resnet.inference.model"
+
+from paddle.v2.plot import Ploter
+
+train_title = "Train cost"
+test_title = "Test cost"
+cost_ploter = Ploter(train_title, test_title)
+
+step = 0
+def event_handler_plot(event):
+global step
+if isinstance(event, fluid.EndStepEvent):
+if step % 1 == 0:
+cost_ploter.append(train_title, step, event.metrics[0])
+cost_ploter.plot()
+step += 1
+if isinstance(event, fluid.EndEpochEvent):
+avg_cost, accuracy = trainer.test(
+reader=test_reader,
+feed_order=['pixel', 'label'])
+cost_ploter.append(test_title, step, avg_cost)
+
+# save parameters
+if params_dirname is not None:
+trainer.save_params(params_dirname)
+```
+
+`event_handler` 用来在训练过程中输出文本日志
+
+```python
+params_dirname = "image_classification_resnet.inference.model"
+
+# event handler to track training and testing process
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+if event.step % 100 == 0:
+print("\nPass %d, Batch %d, Cost %f, Acc %f" %
+(event.step, event.epoch, event.metrics[0],
+event.metrics[1]))
+else:
+sys.stdout.write('.')
+sys.stdout.flush()
+
+if isinstance(event, fluid.EndEpochEvent):
+# Test against with the test dataset to get accuracy.
+avg_cost, accuracy = trainer.test(
+reader=test_reader, feed_order=['pixel', 'label'])
+
+print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy))
+
+# save parameters
+if params_dirname is not None:
+trainer.save_params(params_dirname)
+```
+
+### 训练
+
+通过`trainer.train`函数训练:
+
+**注意:** CPU，每个 Epoch 将花费大约15～20分钟。这部分可能需要一段时间。请随意修改代码，在GPU上运行测试，以提高培训速度。
+
+```python
+trainer.train(
+reader=train_reader,
+num_epochs=2,
+event_handler=event_handler,
+feed_order=['pixel', 'label'])
+```
+
+一轮训练log示例如下所示，经过1个pass， 训练集上平均 Accuracy 为0.59 ，测试集上平均  Accuracy 为0.6 。
+
+```text
+Pass 0, Batch 0, Cost 3.869598, Acc 0.164062
+...................................................................................................
+Pass 100, Batch 0, Cost 1.481038, Acc 0.460938
+...................................................................................................
+Pass 200, Batch 0, Cost 1.340323, Acc 0.523438
+...................................................................................................
+Pass 300, Batch 0, Cost 1.223424, Acc 0.593750
+..........................................................................................
+Test with Pass 0, Loss 1.1, Acc 0.6
+```
+
+图12是训练的分类错误率曲线图，运行到第200个pass后基本收敛，最终得到测试集上分类错误率为8.54%。
+
+![CIFARErrorRate](./image/plot.png)
+<p align="center">
+图12. CIFAR10数据集上VGG模型的分类错误率
+</p>
+
+## 应用模型
+
+可以使用训练好的模型对图片进行分类，下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断，可以打开注释，更改加载的模型。
+
+### 生成预测输入数据
+
+`dog.png` is an example image of a dog. Turn it into an numpy array to match the data feeder format.
+
+```python
+# Prepare testing data.
+from PIL import Image
+import numpy as np
+import os
+
+def load_image(file):
+im = Image.open(file)
+im = im.resize((32, 32), Image.ANTIALIAS)
+
+im = np.array(im).astype(np.float32)
+# The storage order of the loaded image is W(width),
+# H(height), C(channel). PaddlePaddle requires
+# the CHW order, so transpose them.
+im = im.transpose((2, 0, 1))  # CHW
+im = im / 255.0
+
+# Add one dimension to mimic the list format.
+im = numpy.expand_dims(im, axis=0)
+return im
+
+cur_dir = os.getcwd()
+img = load_image(cur_dir + '/image/dog.png')
+```
+
+### Inferencer 配置和预测
+
+`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。
+我们可以简单地插入前面定义的推理程序。
+现在我们准备做预测。
+
+```python
+inferencer = fluid.Inferencer(
+infer_func=inference_program, param_path=params_dirname, place=place)
+
+# inference
+results = inferencer.infer({'pixel': img})
+print("infer results: ", results)
+```
+
+## 总结
+
+传统图像分类方法由多个阶段构成，框架较为复杂，而端到端的CNN模型结构可一步到位，而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型；然后基于CIFAR10数据集，介绍如何使用PaddlePaddle配置和训练CNN模型，尤其是VGG和ResNet模型；最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet，配置和训练流程是同样的，大家可以自行进行实验。
+
+
+## 参考文献
+
+[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.
+
+[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.
+
+[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
+
+[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.
+
+[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.
+
+[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.
+
+[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).
+
+[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.
+
+[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.
+
+[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.
+
+[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。
+
+[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)
+
+[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.
+
+[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.
+
+[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016.
+
+[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).
+
+[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).
+
+[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015.
+
+[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.
+
+[20] http://deeplearning.net/tutorial/lenet.html
+
+[21] https://www.cs.toronto.edu/~kriz/cifar.html
+
+[22] http://cs231n.github.io/classification/
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d16f8b947253a535567ddc8d7b227dd153d9b154
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
@@ -0,0 +1,18 @@
+################
+深度学习基础知识
+################
+
+
+..  todo::
+
+    概述
+    
+..  toctree::
+    :maxdepth: 2
+
+    image_classification/index.md
+    word2vec/index.md
+    recommender_system/index.md
+    understand_sentiment/index.md
+    label_semantic_roles/index.md
+    machine_translation/index.md
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
@@ -0,0 +1,12 @@
+data/train.list
+data/test.*
+data/conll05st-release.tar.gz
+data/conll05st-release
+data/predicate_dict
+data/label_dict
+data/word_dict
+data/emb
+data/feature
+output
+predict.res
+train.log
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..e63f5ebd6d00f2e4ecf97b9ab2027e74683013f2
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..f0a195c24d9ee493f96bb93c28a99e70566be7a4
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
new file mode 100755
index 0000000000000000000000000000000000000000..e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..93b44dd4874402ef29ad7bd7d94147609b92e309
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..592f7ee23bdc88a9a35059612e5ab880bbc9d34b
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..c3646312e48db977402fb353dc0c9b4d02269bf4
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
new file mode 100755
index 0000000000000000000000000000000000000000..9265b671735940ed6549e2980064d2ce08baae64
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..23f4f45b603e3d60702af2b2464d10fc8deed061
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
new file mode 100644
index 0000000000000000000000000000000000000000..0778fda74b2ad22ce4b631791a7b028cdef780a5
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d2914c726b5f4c46e66dfa85d4e88649fede6b3
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..828ca738317992270487647e66b08b6d2f80e209
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md
@@ -0,0 +1,568 @@
+# 语义角色标注
+
+本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+自然语言分析技术大致分为三个层面：词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中，谓词是对主语的陈述或说明，指出“做什么”、“是什么”或“怎么样，代表了一个事件的核心，跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有：施事者（Agent）、受事者（Patient）、客体（Theme）、经验者（Experiencer）、受益者（Beneficiary）、工具（Instrument）、处所（Location）、目标（Goal）和来源（Source）等。
+
+请看下面的例子，“遇到” 是谓词（Predicate，通常简写为“Pred”），“小明”是施事者（Agent），“小红”是受事者（Patient），“昨天” 是事件发生的时间（Time），“公园”是事情发生的地点（Location）。
+
+$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_{\mbox{Time}}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$
+
+语义角色标注（Semantic Role Labeling，SRL）以句子的谓词为中心，不对句子所包含的语义信息进行深入分析，只分析句子中各成分与谓词之间的关系，即句子的谓词（Predicate）- 论元（Argument）结构，并用语义角色来描述这些结构关系，是许多自然语言理解任务（如信息抽取，篇章分析，深度问答等）的一个重要中间步骤。在研究中一般都假定谓词是给定的，所要做的就是找出给定谓词的各个论元和它们的语义角色。
+
+传统的SRL系统大多建立在句法分析基础之上，通常包括5个流程：
+
+1. 构建一棵句法分析树，例如，图1是对上面例子进行依存句法分析得到的一棵句法树。
+2. 从句法树上识别出给定谓词的候选论元。
+3. 候选论元剪除；一个句子中的候选论元可能很多，候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。
+4. 论元识别：这个过程是从上一步剪除之后的候选中判断哪些是真正的论元，通常当做一个二分类问题来解决。
+5. 对第4步的结果，通过多分类得到论元的语义角色标签。可以看到，句法分析是基础，并且后续步骤常常会构造的一些人工特征，这些特征往往也来自句法分析。
+
+![dependencyParsing](./image/dependency_parsing.png)
+<div  align="center">
+图1. 依存句法分析句法树示例
+</div>
+
+然而，完全句法分析需要确定句子所包含的全部句法信息，并确定句子各成分之间的关系，是一个非常困难的任务，目前技术下的句法分析准确率并不高，句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度，同时获得一定的句法结构信息，“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析（partial parsing）或语块划分（chunking）。和完全句法分析得到一颗完整的句法树不同，浅层句法分析只需要识别句子中某些结构相对简单的独立成分，例如：动词短语，这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难，一些研究\[[1](#参考文献)\]也提出了基于语块（chunk）的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集，我们先来介绍这种表示方法。在BIO表示法中，B代表语块的开始，I代表语块的中间，O代表语块结束。通过B、I、O 三种标记将不同的语块赋予不同的标签，例如：对于一个角色为A的论元，将它所包含的第一个语块赋予标签B-A，将它所包含的其它语块赋予标签I-A，不属于任何论元的语块赋予标签O。
+
+我们继续以上面的这句话为例，图1展示了BIO表示方法。
+
+![bioExample](./image/bio_example.png)
+<div  align="center">
+图2. BIO标注方法示例
+</div>
+
+从上面的例子可以看到，根据序列标注结果可以直接得到论元的语义角色标注结果，是一个相对简单的过程。这种简单性体现在：（1）依赖浅层句法分析，降低了句法分析的要求和难度；（2）没有了候选论元剪除这一步骤；（3）论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法，简化了流程，降低了错误累积的风险，往往能够取得更好的结果。
+
+与基于语块的SRL方法类似，在本教程中我们也将SRL看作一个序列标注问题，不同的是，我们只依赖输入文本序列，不依赖任何额外的语法解析结果或是复杂的人造特征，利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例，实践下面的任务：给定一句话和这句话里的一个谓词，通过序列标注的方式，从句子中找到谓词对应的论元，同时标注它们的语义角色。
+
+## 模型概览
+
+循环神经网络（Recurrent Neural Network）是一种对序列建模的重要模型，在自然语言处理任务中有着广泛地应用。不同于前馈神经网络（Feed-forward Neural Network），RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种，常用来学习长序列中蕴含的长程依赖关系，我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过，这一篇中我们依然利用LSTM来解决SRL问题。
+
+### 栈式循环神经网络（Stacked Recurrent Neural Network）
+
+深层网络有助于形成层次化特征，网络上层在下层已经学习到的初级特征基础上，形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络，但由于LSTM各个时间步参数共享，`$t-1$`时刻状态到`$t$`时刻的映射，始终只经过了一次非线性映射，也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元，令前一个LSTM`$t$`时刻的输出，成为下一个LSTM单元`$t$`时刻的输入，帮助我们构建起一个深层网络，我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力，能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。
+
+然而，训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常，堆叠4层LSTM单元可以正常训练，当层数达到4~8层时，会出现性能衰减，这时必须考虑一些新的结构以保证梯度纵向顺畅传播，这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一：在记忆单元（Memory Cell）这条信息传播的路线上没有非线性映射，当梯度反向传播时既不会衰减、也不会爆炸。因此，深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。
+
+一个LSTM单元完成的运算可以被分为三部分：（1）输入到隐层的映射（input-to-hidden） ：每个时间步输入信息`$x$`会首先经过一个矩阵映射，再作为遗忘门，输入门，记忆单元，输出门的输入，注意，这一次映射没有引入非线性激活；（2）隐层到隐层的映射（hidden-to-hidden）：这一步是LSTM计算的主体，包括遗忘门，输入门，记忆单元更新，输出门的计算；（3）隐层到输出的映射（hidden-to-output）：通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上，加入一条新的路径：除上一层LSTM输出之外，将前层LSTM的输入到隐层的映射作为的一个新的输入，同时加入一个线性映射去学习一个新的变换。
+
+图3是最终得到的栈式循环神经网络结构示意图。
+
+![lstmStructure](./image/stacked_lstm.png)
+<p align="center">
+图3. 基于LSTM的栈式循环神经网络结构示意图
+</p>
+
+### 双向循环神经网络（Bidirectional Recurrent Neural Network）
+
+在LSTM中，`$t$`时刻的隐藏层向量编码了到`$t$`时刻为止所有输入的信息，但`$t$`时刻的LSTM可以看到历史，却无法看到未来。在绝大多数自然语言处理任务中，我们几乎总是能拿到整个句子。这种情况下，如果能够像获取历史信息一样，得到未来的信息，对序列学习任务会有很大的帮助。
+
+为了克服这一缺陷，我们可以设计一种双向循环网络单元，它的思想简单且直接：对上一节的栈式循环神经网络进行一个小小的修改，堆叠多个LSTM单元，让每一层LSTM单元分别以：正向、反向、正向 …… 的顺序学习上一层的输出序列。于是，从第2层开始，`$t$`时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
+
+![lstmStructure](./image/bidirectional_stacked_lstm.png)
+<p align="center">
+图4. 基于LSTM的双向循环神经网络结构示意图
+</p>
+
+需要说明的是，这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同，我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中，介绍另一种双向循环神经网络。
+
+### 条件随机场 (Conditional Random Field)
+
+使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务。在SRL任务中，深层LSTM网络学习输入的特征表示，条件随机场（Conditional Random Filed， CRF）在特征的基础上完成序列标注，处于整个网络的末端。
+
+CRF是一种概率化结构模型，可以看作是一个概率无向图模型，结点表示随机变量，边表示随机变量之间的概率依赖关系。简单来讲，CRF学习条件概率`$P(X|Y)$`，其中 `$X = (x_1, x_2, ... , x_n)$` 是输入序列，`$Y = (y_1, y_2, ... , y_n)$` 是标记序列；解码过程是给定 `$X$`序列求解令`$P(Y|X)$`最大的`$Y$`序列，即`$Y^* = \mbox{arg max}_{Y} P(Y | X)$`。
+
+序列标注任务只需要考虑输入和输出都是一个线性序列，并且由于我们只是将输入序列作为条件，不做任何条件独立假设，因此输入序列的元素之间并不存在图结构。综上，在序列标注任务中使用的是如图5所示的定义在链式图上的CRF，称之为线性链条件随机场（Linear Chain Conditional Random Field）。
+
+![linear_chain_crf](./image/linear_chain_crf.png)
+<p align="center">
+图5. 序列标注任务中使用的线性链条件随机场
+</p>
+
+根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\]，在给定观测序列`$X$`时，一个特定标记序列`$Y$`的概率可以定义为：
+
+$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$
+
+其中`$Z(X)$`是归一化因子，`$t_j$` 是定义在边上的特征函数，依赖于当前和前一个位置，称为转移特征，表示对于输入序列`$X$`及其标注序列在 `$i$`及`$i - 1$`位置上标记的转移概率。`$s_k$`是定义在结点上的特征函数，称为状态特征，依赖于当前位置，表示对于观察序列`$X$`及其`$i$`位置的标记概率。`$\lambda_j$` 和 `$\mu_k$` 分别是转移特征函数和状态特征函数对应的权值。实际上，`$t$`和`$s$`可以用相同的数学形式表示，再对转移特征和状态特在各个位置`$i$`求和有：`$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$`，把`$f$`统称为特征函数，于是`$P(Y|X)$`可表示为：
+
+$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$
+
+`$\omega$`是特征函数对应的权值，是CRF模型要学习的参数。训练时，对于给定的输入序列和对应的标记序列集合`$D = \left[(X_1,  Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$` ，通过正则化的极大似然估计，求解如下优化目标：
+
+$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$
+
+这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时，对于给定的输入序列`$X$`，通过解码算法（通常有：维特比算法、Beam Search）求令出条件概率`$\bar{P}(Y|X)$`最大的输出序列 `$\bar{Y}$`。
+
+### 深度双向LSTM（DB-LSTM）SRL模型
+
+在SRL任务中，输入是 “谓词” 和 “一句话”，目标是从这句话中找到谓词的论元，并标注论元的语义角色。如果一个句子含有`$n$`个谓词，这个句子会被处理`$n$`次。一个最为直接的模型是下面这样：
+
+1. 构造输入；
+- 输入1是谓词，输入2是句子
+- 将输入1扩展成和输入2一样长的序列，用one-hot方式表示；
+2. one-hot方式的谓词序列和句子序列通过词表，转换为实向量表示的词向量序列；
+3. 将步骤2中的2个词向量序列作为双向LSTM的输入，学习输入序列的特征表示；
+4. CRF以步骤3中模型学习到的特征为输入，以标记序列为监督信号，实现序列标注；
+
+大家可以尝试上面这种方法。这里，我们提出一些改进，引入两个简单但对提高系统性能非常有效的特征：
+
+- 谓词上下文：上面的方法中，只用到了谓词的词向量表达谓词相关的所有信息，这种方法始终是非常弱的，特别是如果谓词在句子中出现多次，有可能引起一定的歧义。从经验出发，谓词前后若干个词的一个小片段，能够提供更丰富的信息，帮助消解歧义。于是，我们把这样的经验也添加到模型中，为每个谓词同时抽取一个“谓词上下文” 片段，也就是从这个谓词前后各取`$n$`个词构成的一个窗口片段；
+- 谓词上下文区域标记：为句子中的每一个词引入一个0-1二值变量，表示它们是否在“谓词上下文”片段中；
+
+修改后的模型如下（图6是一个深度为4的模型结构示意图）：
+
+1. 构造输入
+- 输入1是句子序列，输入2是谓词序列，输入3是谓词上下文，从句子中抽取这个谓词前后各`$n$`个词，构成谓词上下文，用one-hot方式表示，输入4是谓词上下文区域标记，标记了句子中每一个词是否在谓词上下文中；
+- 将输入2~3均扩展为和输入1一样长的序列；
+2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列；其中输入1、3共享同一个词表，输入2和4各自独有词表；
+3. 第2步的4个词向量序列作为双向LSTM模型的输入；LSTM模型学习输入序列的特征表示，得到新的特性表示序列；
+4. CRF以第3步中LSTM学习到的特征为输入，以标记序列为监督信号，完成序列标注；
+
+![db_lstm_network](./image/db_lstm_network.png)
+<div  align="center">
+图6. SRL任务上的深层双向LSTM模型
+</div>
+
+
+## 数据介绍
+
+在此教程中，我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是，CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开，目前，能够获取到的只有测试集，包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中，我们以测试集中的WSJ数据为训练集来讲解模型。但是，由于测试集中样本的数量远远不够，如果希望训练一个可用的神经网络SRL系统，请考虑付费获取全量数据。
+
+原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中，我们使用test.wsj文件夹中的数据进行训练和测试，并只会用到words文件夹（文本序列）和props文件夹（标注结果）下的数据。本教程使用的数据目录如下：
+
+```text
+conll05st-release/
+└── test.wsj
+├── props  # 标注结果
+└── words  # 输入文本序列
+```
+
+标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同，但原理是相同的，关于标注结果标签含义的说明，请参考论文\[[9](#参考文献)\]。
+
+原始数据需要进行数据预处理才能被PaddlePaddle处理，预处理包括下面几个步骤:
+
+1. 将文本序列和标记序列其合并到一条记录中；
+2. 一个句子如果含有`$n$`个谓词，这个句子会被处理`$n$`次，变成`$n$`条独立的训练样本，每个样本一个不同的谓词；
+3. 抽取谓词上下文和构造谓词上下文区域标记；
+4. 构造以BIO法表示的标记；
+5. 依据词典获取词对应的整数索引。
+
+
+```python
+# import paddle.v2.dataset.conll05 as conll05
+# conll05.corpus_reader函数完成上面第1步和第2步.
+# conll05.reader_creator函数完成上面第3步到第5步.
+# conll05.test函数可以获取处理之后的每条样本来供PaddlePaddle训练.
+```
+
+预处理完成之后一条训练样本包含9个特征，分别是：句子序列、谓词、谓词上下文（占 5 列）、谓词上下区域标志、标注序列。下表是一条训练样本的示例。
+
+| 句子序列 | 谓词 | 谓词上下文（窗口 = 5） | 谓词上下文区域标记 | 标注序列 |
+|---|---|---|---|---|
+| A | set | n't been set . × | 0 | B-A1 |
+| record | set | n't been set . × | 0 | I-A1 |
+| date | set | n't been set . × | 0 | I-A1 |
+| has | set | n't been set . × | 0 | O |
+| n't | set | n't been set . × | 1 | B-AM-NEG |
+| been | set | n't been set . × | 1 | O |
+| set | set | n't been set . × | 1 | B-V |
+| . | set | n't been set . × | 1 | O |
+
+
+除数据之外，我们同时提供了以下资源：
+
+| 文件名称 | 说明 |
+|---|---|
+| word_dict | 输入句子的词典，共计44068个词 |
+| label_dict | 标记的词典，共计106个标记 |
+| predicate_dict | 谓词的词典，共计3162个词 |
+| emb | 一个训练好的词表，32维 |
+
+我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中，词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token，词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中，我们将它们全部看作未登录词，用`<unk>`表示。
+
+获取词典，打印词典大小：
+
+```python
+import math, os
+import numpy as np
+import paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.fluid as fluid
+import time
+
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_dict_len = len(verb_dict)
+
+print word_dict_len
+print label_dict_len
+print pred_dict_len
+```
+
+## 模型配置说明
+
+- 定义输入数据维度及模型超参数。
+
+```python
+mark_dict_len = 2   # 谓上下文区域标志的维度，是一个0-1 2值特征，因此维度为2
+word_dim = 32       # 词向量维度
+mark_dim = 5        # 谓词上下文区域通过词表被映射为一个实向量，这个是相邻的维度
+hidden_dim = 512    # LSTM隐层向量的维度 ： 512 / 4
+depth = 8           # 栈式LSTM的深度
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 10
+
+embedding_name = 'emb'
+```
+
+这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维，关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。
+
+- 如上文提到，我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数，在训练中不更新。
+
+```python
+# 这里加载PaddlePaddle上版保存的二进制模型
+def load_parameter(file_name, h, w):
+with open(file_name, 'rb') as f:
+f.read(16)  # skip header.
+return np.fromfile(f, dtype=np.float32).reshape(h, w)
+```
+
+- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。
+
+```python
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+**ignored):
+# 8 features
+predicate_embedding = fluid.layers.embedding(
+input=predicate,
+size=[pred_dict_len, word_dim],
+dtype='float32',
+is_sparse=IS_SPARSE,
+param_attr='vemb')
+
+mark_embedding = fluid.layers.embedding(
+input=mark,
+size=[mark_dict_len, mark_dim],
+dtype='float32',
+is_sparse=IS_SPARSE)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+# Since word vector lookup table is pre-trained, we won't update it this time.
+# trainable being False prevents updating the lookup table during training.
+emb_layers = [
+fluid.layers.embedding(
+size=[word_dict_len, word_dim],
+input=x,
+param_attr=fluid.ParamAttr(
+name=embedding_name, trainable=False)) for x in word_input
+]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
+
+# 8 LSTM units are trained through alternating left-to-right / right-to-left order
+# denoted by the variable `reverse`.
+hidden_0_layers = [
+fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+for emb in emb_layers
+]
+
+hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+lstm_0 = fluid.layers.dynamic_lstm(
+input=hidden_0,
+size=hidden_dim,
+candidate_activation='relu',
+gate_activation='sigmoid',
+cell_activation='sigmoid')
+
+# stack L-LSTM and R-LSTM with direct edges
+input_tmp = [hidden_0, lstm_0]
+
+# In PaddlePaddle, state features and transition features of a CRF are implemented
+# by a fully connected layer and a CRF layer seperately. The fully connected layer
+# with linear activation learns the state features, here we use fluid.layers.sums
+# (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle:
+# fluid.layers.linear_chain_crf only
+# learns the transition features, which is a cost layer and is the last layer of the network.
+# fluid.layers.linear_chain_crf outputs the log probability of true tag sequence
+# as the cost by given the input sequence and it requires the true tag sequence
+# as target in the learning process.
+
+for i in range(1, depth):
+mix_hidden = fluid.layers.sums(input=[
+fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+])
+
+lstm = fluid.layers.dynamic_lstm(
+input=mix_hidden,
+size=hidden_dim,
+candidate_activation='relu',
+gate_activation='sigmoid',
+cell_activation='sigmoid',
+is_reverse=((i % 2) == 1))
+
+input_tmp = [mix_hidden, lstm]
+
+# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，
+# 经过一个全连接层映射到标记字典的维度，来学习 CRF 的状态特征
+feature_out = fluid.layers.sums(input=[
+fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
+])
+
+return feature_out
+```
+
+## 训练模型
+
+- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练，在构造时还需指定优化方法，这里使用最基本的SGD方法(momentum设置为0)，同时设定了学习率、正则等。
+
+- 数据介绍部分提到CoNLL 2005训练集付费，这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本，包含9个特征，shuffle和组完batch后作为训练的输入。
+
+- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。
+
+- 可以使用event_handler回调函数来观察训练过程，或进行测试等。这里我们打印了训练过程的cost，该回调函数是trainer.train函数里设定。
+
+- 通过trainer.train函数训练
+
+```python
+def train(use_cuda, save_dirname=None, is_local=True):
+# define network topology
+
+# 句子序列
+word = fluid.layers.data(
+name='word_data', shape=[1], dtype='int64', lod_level=1)
+
+# 谓词
+predicate = fluid.layers.data(
+name='verb_data', shape=[1], dtype='int64', lod_level=1)
+
+# 谓词上下文5个特征
+ctx_n2 = fluid.layers.data(
+name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+ctx_n1 = fluid.layers.data(
+name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+ctx_0 = fluid.layers.data(
+name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+ctx_p1 = fluid.layers.data(
+name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+ctx_p2 = fluid.layers.data(
+name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+
+# 谓词上下区域标志
+mark = fluid.layers.data(
+name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+# define network topology
+feature_out = db_lstm(**locals())
+
+# 标注序列
+target = fluid.layers.data(
+name='target', shape=[1], dtype='int64', lod_level=1)
+
+# 学习 CRF 的转移特征
+crf_cost = fluid.layers.linear_chain_crf(
+input=feature_out,
+label=target,
+param_attr=fluid.ParamAttr(
+name='crfw', learning_rate=mix_hidden_lr))
+
+avg_cost = fluid.layers.mean(crf_cost)
+
+sgd_optimizer = fluid.optimizer.SGD(
+learning_rate=fluid.layers.exponential_decay(
+learning_rate=0.01,
+decay_steps=100000,
+decay_rate=0.5,
+staircase=True))
+
+sgd_optimizer.minimize(avg_cost)
+
+# The CRF decoding layer is used for evaluation and inference.
+# It shares weights with CRF layer.  The sharing of parameters among multiple layers
+# is specified by using the same parameter name in these layers. If true tag sequence
+# is provided in training process, `fluid.layers.crf_decoding` calculates labelling error
+# for each input token and sums the error over the entire sequence.
+# Otherwise, `fluid.layers.crf_decoding`  generates the labelling tags.
+crf_decode = fluid.layers.crf_decoding(
+input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
+train_data = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.conll05.test(), buf_size=8192),
+batch_size=BATCH_SIZE)
+
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+
+feeder = fluid.DataFeeder(
+feed_list=[
+word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+],
+place=place)
+exe = fluid.Executor(place)
+
+def train_loop(main_program):
+exe.run(fluid.default_startup_program())
+embedding_param = fluid.global_scope().find_var(
+embedding_name).get_tensor()
+embedding_param.set(
+load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
+place)
+
+start_time = time.time()
+batch_id = 0
+for pass_id in xrange(PASS_NUM):
+for data in train_data():
+cost = exe.run(main_program,
+feed=feeder.feed(data),
+fetch_list=[avg_cost])
+cost = cost[0]
+
+if batch_id % 10 == 0:
+print("avg_cost:" + str(cost))
+if batch_id != 0:
+print("second per batch: " + str((time.time(
+) - start_time) / batch_id))
+# Set the threshold low to speed up the CI test
+if float(cost) < 60.0:
+if save_dirname is not None:
+fluid.io.save_inference_model(save_dirname, [
+'word_data', 'verb_data', 'ctx_n2_data',
+'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+'ctx_p2_data', 'mark_data'
+], [feature_out], exe)
+return
+
+batch_id = batch_id + 1
+
+train_loop(fluid.default_main_program())
+```
+
+
+## 应用模型
+
+训练完成之后，需要依据某个我们关心的性能指标选择最优的模型进行预测，可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。
+
+```python
+def infer(use_cuda, save_dirname=None):
+if save_dirname is None:
+return
+
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+inference_scope = fluid.core.Scope()
+with fluid.scope_guard(inference_scope):
+# Use fluid.io.load_inference_model to obtain the inference program desc,
+# the feed_target_names (the names of variables that will be fed
+# data using feed operators), and the fetch_targets (variables that
+# we want to obtain data from using fetch operators).
+[inference_program, feed_target_names,
+fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Setup inputs by creating LoDTensors to represent sequences of words.
+# Here each word is the basic element of these LoDTensors and the shape of
+# each word (base_shape) should be [1] since it is simply an index to
+# look up for the corresponding word vector.
+# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+# which has only one lod level. Then the created LoDTensors will have only
+# one higher level structure (sequence of words, or sentence) than the basic
+# element (word). Hence the LoDTensor will hold data for three sentences of
+# length 3, 4 and 2, respectively.
+# Note that lod info should be a list of lists.
+lod = [[3, 4, 2]]
+base_shape = [1]
+# The range of random integers is [low, high]
+word = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+pred = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=pred_dict_len - 1)
+ctx_n2 = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+ctx_n1 = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+ctx_0 = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+ctx_p1 = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+ctx_p2 = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=word_dict_len - 1)
+mark = fluid.create_random_int_lodtensor(
+lod, base_shape, place, low=0, high=mark_dict_len - 1)
+
+# Construct feed as a dictionary of {feed_target_name: feed_target_data}
+# and results will contain a list of data corresponding to fetch_targets.
+assert feed_target_names[0] == 'word_data'
+assert feed_target_names[1] == 'verb_data'
+assert feed_target_names[2] == 'ctx_n2_data'
+assert feed_target_names[3] == 'ctx_n1_data'
+assert feed_target_names[4] == 'ctx_0_data'
+assert feed_target_names[5] == 'ctx_p1_data'
+assert feed_target_names[6] == 'ctx_p2_data'
+assert feed_target_names[7] == 'mark_data'
+
+results = exe.run(inference_program,
+feed={
+feed_target_names[0]: word,
+feed_target_names[1]: pred,
+feed_target_names[2]: ctx_n2,
+feed_target_names[3]: ctx_n1,
+feed_target_names[4]: ctx_0,
+feed_target_names[5]: ctx_p1,
+feed_target_names[6]: ctx_p2,
+feed_target_names[7]: mark
+},
+fetch_list=fetch_targets,
+return_numpy=False)
+print(results[0].lod())
+np_data = np.array(results[0])
+print("Inference Shape: ", np_data.shape)
+```
+
+整个程序的入口如下：
+
+```python
+def main(use_cuda, is_local=True):
+if use_cuda and not fluid.core.is_compiled_with_cuda():
+return
+
+# Directory for saving the trained model
+save_dirname = "label_semantic_roles.inference.model"
+
+train(use_cuda, save_dirname, is_local)
+infer(use_cuda, save_dirname)
+
+
+main(use_cuda=False)
+```
+
+## 总结
+
+语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例，介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放，教程中只使用测试数据作为示例。在这个过程中，我们希望减少对其它自然语言处理工具的依赖，利用神经网络数据驱动、端到端学习的能力，得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。
+
+## 参考文献
+1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
+2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013.
+3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
+4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014.
+5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289.
+6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012.
+7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330.
+8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106.
+9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164.
+10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md
new file mode 100644
index 0000000000000000000000000000000000000000..a27499c6ed8d1149c6d519006086febbcae943fa
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md
@@ -0,0 +1,54 @@
+# 学习资料
+
+## 要读的第一本书
+基础理论习得的最直接来源就是书本。按机器学习理论、深度学习理论、编程语言三方面划分，这里推荐如下书籍辅助您。
+
+
+### 机器学习理论
+
+在开启深度学习之前，您需要先行掌握机器学习的理论。深度学习是机器学习中的一个分支，两者内在的理论基础存在强关联。
+机器学习理论的书籍教材比较多，这里推荐一本易懂易学的书籍，可以重点关注神经网络部分。
+
+书名：《机器学习》（周志华著，清华大学出版社，2016年版）
+
+### 深度学习理论
+
+打好机器学习的理论功底后，您可以开始钻研深度学习的理论。通常深度学习理论会给人留下抽象难懂的印象，且和数学结合紧密。
+为了让您能够顺利入门，这里推荐一份易学易用的教材，无论深度学习理论还是数学理论即可一本搞定。
+
+书名：《Deep Learning（深度学习）》（Goodfellow, Bengio, Courville合著，赵申剑、黎彧君、符天凡和李凯合译，人民邮电出版社，2017年版）
+此书电子版在Github上已经开源，详情可参考此链接 [《深度学习》](https://github.com/exacity/deeplearningbook-chinese)
+
+### 编程语言
+
+Python方向：这里推荐您学习Python，一方面各大主流深度学习框架的主力支撑编程语言均为Python；另一方面，对比其他语言，Python较为简单易学。
+Python的教材种类较多，这里推荐一本实操和理论性都兼顾的教材，只要完成书中52个习题，跑代码然后发现问题解决，就能逐步上手。
+
+书名：《“笨办法”学Python》（Zed Shaw著，王巍巍译，人民邮电出版社，2014年11月版）
+
+
+C++方向：C++语言在底层框架中使用较多，您逐步掌握开源框架的基本操作后，在更高阶的框架应用中会用到这个技能点。
+同前面提到的Python一样，学习C++时需要多上手操作。这里推荐迅速上手C++的书籍，不但能够学习功能和结构，还提供了解决方案的示例。
+
+书名：《Essential C++》【美】李普曼（Lippman,S.B.）著，侯捷译，电子工业出版社2013年8月版
+
+
+
+## 要看的视频公开课
+
+在学习一门新技术的同时，除了看书，如果有老师面对面教授，可以更快更好的学会知识。相比于线下授课，视频公开课能够在省钱省力的同时，达到易学易掌握的效果。
+目前深度学习的课程多是公开免费的，通过学习您可以更轻松的理解深度学习中的抽象理论，并在实操方面不绕弯路。
+综合课程生动性、可操作性、紧凑性、连续性这些特点，这里推荐如下课程，同步附上网址，便于您查找学习。
+
+### 理论知识详解视频课
+[机器学习](http://open.163.com/special/opencourse/machinelearning.html) 斯坦福大学教授吴恩达公开课程，包含相关算法的详细讲解。
+
+[AI技术](https://ai.baidu.com/paddlepaddle/player?id=13) 百度推出的“AI核心技术掌握”课程，每节课在20-30分钟左右，从AI技术到深度学习进行全面细致的解读。
+
+[深度学习](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) 台湾李宏毅教授的在线课程，其中是英文课程，会结合国外的科研成果，但也适合新手入门和理解深度学习。
+
+[编程语言](https://ai.baidu.com/paddlepaddle/openCourses) Python操作课程，从基础到进阶操作都提供详细说明，每节课时长20分钟左右。
+
+### PaddlePaddle实操视频课
+掌握好理论基础，具备编程能力后，您可以开始使用PaddlePaddle Fluid进行实操，从初阶开始学习，向着中高阶努力。
+目前已有PaddlePaddle官方视频公开课在官网呈现,内含PaddlePaddle实战、PaddlePaddle应用场景和机器学习模型讲解课程，帮助开发者从零开始使用PaddlePaddle，从简单场景逐步过渡到工业级应用。[点击这里](http://ai.baidu.com/paddlepaddle/openCourses)您即可开始视频课的学习之旅。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
@@ -0,0 +1,9 @@
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
+multi-bleu.perl
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d8efd50a49d0305586f550344472ab94c93bed3
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..4b35c88fc8ea2c503473c0c15711744e784d6af6
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b355e7786d25487a3f564af758c2c52c43b4690
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..3728f782ee09d9308d02b42305027b2735467ead
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
new file mode 100644
index 0000000000000000000000000000000000000000..28d7a15a3bd65262bde22a3f41b5aa78b46b368a
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..ea8585565da1ecaf241654c278c6f9b15e283286
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
new file mode 100755
index 0000000000000000000000000000000000000000..60aee0017de73f462e35708b1055aff8992c03e1
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..6b73798fe632e0873b35c117b86f347c8cf3116a
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
new file mode 100644
index 0000000000000000000000000000000000000000..0cde685b84106650a4df18ce335a23e6338d3d11
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..a6af429f23f0f7e82650139bbd8dcbef27a34abe
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf56d73ebf297fadf522389c7b6836dd379aa097
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..557310e044b2b6687e5ea6895417ed946ac7bc11
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc161aaae9c37b0e1a596204e7138025a98adb1d
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md
@@ -0,0 +1,448 @@
+# 机器翻译
+
+本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+机器翻译（machine translation, MT）是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言（source language），翻译成的结果语言称为目标语言（target language）。机器翻译即实现从源语言到目标语言转换的过程，是自然语言处理的重要研究领域之一。
+
+早期机器翻译系统多为基于规则的翻译系统，需要由语言学家编写两种语言之间的转换规则，再将这些规则录入计算机。该方法对语言学家的要求非常高，而且我们几乎无法总结一门语言会用到的所有规则，更何况两种甚至更多的语言。因此，传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。
+
+为解决以上问题，统计机器翻译（Statistical Machine Translation, SMT）技术应运而生。在统计机器翻译技术中，转化规则是由机器自动从大规模的语料中学习得到的，而非我们人主动提供规则。因此，它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题，但仍然存在许多挑战：1）人为设计许多特征（feature），但永远无法覆盖所有的语言现象；2）难以利用全局的特征；3）依赖于许多预处理环节，如词语对齐、分词或符号化（tokenization）、规则抽取、句法分析等，而每个环节的错误会逐步累积，对翻译的影响也越来越大。
+
+近年来，深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类：1）仍以统计机器翻译系统为框架，只是利用神经网络来改进其中的关键模块，如语言模型、调序模型等（见图1的左半部分）；2）不再以统计机器翻译系统为框架，而是直接用神经网络将源语言映射到目标语言，即端到端的神经网络机器翻译（End-to-End Neural Machine Translation, End-to-End NMT）（见图1的右半部分），简称为NMT模型。
+![nmt](./image/nmt.png)
+<p align="center">
+图1. 基于神经网络的机器翻译系统
+</p>
+
+本教程主要介绍NMT模型，以及如何用PaddlePaddle来训练一个NMT模型。
+
+## 效果展示
+
+以中英翻译（中文翻译到英文）的模型为例，当模型训练完毕时，如果输入如下已分词的中文句子：
+```text
+这些 是 希望 的 曙光 和 解脱 的 迹象 .
+```
+如果设定显示翻译结果的条数（即[柱搜索算法](#柱搜索算法)的宽度）为3，生成的英语句子如下：
+```text
+0 -5.36816   These are signs of hope and relief . <e>
+1 -6.23177   These are the light of hope and relief . <e>
+2 -7.7914  These are the light of hope and the relief of hope . <e>
+```
+- 左起第一列是生成句子的序号；左起第二列是该条句子的得分（从大到小），分值越高越好；左起第三列是生成的英语句子。
+- 另外有两个特殊标志：`<e>`表示句子的结尾，`<unk>`表示未登录词（unknown word），即未在训练字典中出现的词。
+
+## 模型概览
+
+本节依次介绍双向循环神经网络（Bi-directional Recurrent Neural Network），NMT模型中典型的编码器-解码器（Encoder-Decoder）框架以及柱搜索（beam search）算法。
+
+### 双向循环神经网络
+
+我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络，这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列，得到其在每个时刻的特征表示，即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。
+
+具体来说，该双向循环神经网络分别在时间维以顺序和逆序——即前向（forward）和后向（backward）——依次处理输入序列，并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点，都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN，其中有六个权重矩阵：输入到前向隐层和后向隐层的权重矩阵（`$W_1, W_3$`），隐层到隐层自己的权重矩阵（`$W_2,W_5$`），前向隐层和后向隐层到输出层的权重矩阵（`$W_4, W_6$`）。注意，该网络的前向隐层和后向隐层之间没有连接。
+
+![bi_rnn](./image/bi_rnn.png)
+<p align="center">
+图3. 按时间步展开的双向循环神经网络
+</p>
+
+### 编码器-解码器框架
+
+编码器-解码器（Encoder-Decoder）\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量，解码阶段通过最大化预测序列概率，从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
+![encoder_decoder](./image/encoder_decoder.png)
+<p align="center">
+图4. 编码器-解码器框架
+</p>
+
+#### 编码器
+
+编码阶段分为三步：
+
+1. one-hot vector表示：将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同，并且只有一个维度上有值1（该位置对应该词在词汇表中的位置），其余全是0。
+
+2. 映射到低维语义空间的词向量：one-hot vector表示存在两个问题，1）生成的向量维度往往很大，容易造成维数灾难；2）难以刻画词与词之间的关系（如语义相似性，也就是无法很好地表达语义）。因此，需再one-hot vector映射到低维的语义空间，由一个固定维度的稠密向量（称为词向量）表示。记映射矩阵为`$C\epsilon R^{K\times \left | V \right |}$`，用`$s_i=Cw_i$`表示第`$i$`个词的词向量，`$K$`为向量维度。
+
+3. 用RNN编码源语言词序列：这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`，其中`$h_0$`是一个全零的向量，`$\varnothing _\theta$`是一个非线性激活函数，最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码，或使用时间维上的池化（pooling）结果。
+
+第3步也可以使用双向循环神经网络实现更复杂的句编码表示，具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词，并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的，后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词，得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`，通过拼接两个GRU的结果得到它的隐层状态，即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
+
+![encoder_attention](./image/encoder_attention.png)
+<p align="center">
+图5. 使用双向GRU的编码器
+</p>
+
+#### 解码器
+
+机器翻译任务的训练过程中，解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是：
+
+1. 每一个时刻，根据源语言句子的编码信息（又叫上下文向量，context vector）`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。计算公式如下：
+
+$$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$
+
+其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用[注意力机制](#注意力机制)时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。
+
+2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下：
+
+$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
+
+其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分，再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。
+
+3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。
+4. 重复步骤1~3，直到目标语言序列中的所有词处理完毕。
+
+机器翻译任务的生成过程，通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异，具体介绍请见[柱搜索算法](#柱搜索算法)。
+
+### 柱搜索算法
+
+柱搜索（[beam search](http://en.wikipedia.org/wiki/Beam_search)）是一种启发式图搜索算法，用于在图或树中搜索有限集合中的最优扩展节点，通常用在解空间非常大的系统（如机器翻译、语音识别）中，原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”，就算目标语言字典中只有3个词（`<s>`, `<e>`, `hello`），也可能生成无限句话（`hello`循环出现的次数不定），为了找到其中较好的翻译结果，我们可采用柱搜索算法。
+
+柱搜索算法使用广度优先策略建立搜索树，在树的每一层，按照启发代价（heuristic cost）（本教程中，为生成词的log概率之和）对节点进行排序，然后仅留下预先确定的个数（文献中通常称为beam width、beam size、柱宽度等）的节点。只有这些节点会在下一层继续扩展，其他节点就被剪掉了，也就是说保留了质量较高的节点，剪枝了质量较差的节点。因此，搜索所占用的空间和时间大幅减少，但缺点是无法保证一定获得最优解。
+
+使用柱搜索算法的解码阶段，目标是最大化生成序列的概率。思路是：
+
+1. 每一个时刻，根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。
+2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。
+3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。
+4. 重复步骤1~3，直到获得句子结束标记`<e>`或超过句子的最大生成长度为止。
+
+注意：`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的，因此并不能保证得到全局最优解。
+
+## 数据介绍
+
+本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集，[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。
+
+### 数据预处理
+
+我们的预处理流程包括两步：
+- 将每个源语言到目标语言的平行语料库文件合并为一个文件：
+- 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。
+- `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接，用'\t'分隔。
+- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词，包括：语料中词频最高的（DICTSIZE - 3）个单词，和3个特殊符号`<s>`（序列的开始）、`<e>`（序列的结束）和`<unk>`（未登录词）。
+
+### 示例数据
+
+因为完整的数据集数据量较大，为了验证训练流程，PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。
+
+该数据集有193319条训练数据，6003条测试数据，词典长度为30000。因为数据规模限制，使用该数据集训练出来的模型效果无法保证。
+
+## 模型配置说明
+
+下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。
+
+```python
+import contextlib
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as pd
+from paddle.fluid.executor import Executor
+from functools import partial
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+hidden_dim = 32
+word_dim = 16
+batch_size = 2
+max_length = 8
+topk_size = 50
+beam_size = 2
+
+decoder_size = hidden_dim
+```
+
+然后如下实现编码器框架：
+
+```python
+def encoder(is_sparse):
+src_word_id = pd.data(
+name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+src_embedding = pd.embedding(
+input=src_word_id,
+size=[dict_size, word_dim],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr=fluid.ParamAttr(name='vemb'))
+
+fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+return encoder_out
+```
+
+再实现训练模式下的解码器：
+
+```python
+def train_decoder(context, is_sparse):
+trg_language_word = pd.data(
+name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+trg_embedding = pd.embedding(
+input=trg_language_word,
+size=[dict_size, word_dim],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr=fluid.ParamAttr(name='vemb'))
+
+rnn = pd.DynamicRNN()
+with rnn.block():
+current_word = rnn.step_input(trg_embedding)
+pre_state = rnn.memory(init=context)
+current_state = pd.fc(input=[current_word, pre_state],
+size=decoder_size,
+act='tanh')
+
+current_score = pd.fc(input=current_state,
+size=target_dict_dim,
+act='softmax')
+rnn.update_memory(pre_state, current_state)
+rnn.output(current_score)
+
+return rnn()
+```
+
+实现推测模式下的解码器：
+
+```python
+def decode(context, is_sparse):
+init_state = context
+array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
+
+# fill the first element with init_state
+state_array = pd.create_array('float32')
+pd.array_write(init_state, array=state_array, i=counter)
+
+# ids, scores as memory
+ids_array = pd.create_array('int64')
+scores_array = pd.create_array('float32')
+
+init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+init_scores = pd.data(
+name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+pd.array_write(init_ids, array=ids_array, i=counter)
+pd.array_write(init_scores, array=scores_array, i=counter)
+
+cond = pd.less_than(x=counter, y=array_len)
+
+while_op = pd.While(cond=cond)
+with while_op.block():
+pre_ids = pd.array_read(array=ids_array, i=counter)
+pre_state = pd.array_read(array=state_array, i=counter)
+pre_score = pd.array_read(array=scores_array, i=counter)
+
+# expand the lod of pre_state to be the same with pre_score
+pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+pre_ids_emb = pd.embedding(
+input=pre_ids,
+size=[dict_size, word_dim],
+dtype='float32',
+is_sparse=is_sparse)
+
+# use rnn unit to update rnn
+current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
+size=decoder_size,
+act='tanh')
+current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
+# use score to do beam search
+current_score = pd.fc(input=current_state_with_lod,
+size=target_dict_dim,
+act='softmax')
+topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
+selected_ids, selected_scores = pd.beam_search(
+pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+pd.increment(x=counter, value=1, in_place=True)
+
+# update the memories
+pd.array_write(current_state, array=state_array, i=counter)
+pd.array_write(selected_ids, array=ids_array, i=counter)
+pd.array_write(selected_scores, array=scores_array, i=counter)
+
+pd.less_than(x=counter, y=array_len, cond=cond)
+
+translation_ids, translation_scores = pd.beam_search_decode(
+ids=ids_array, scores=scores_array)
+
+return translation_ids, translation_scores
+```
+
+进而，我们定义一个`train_program`来使用`inference_program`计算出的结果，在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。
+
+```python
+def train_program(is_sparse):
+context = encoder(is_sparse)
+rnn_out = train_decoder(context, is_sparse)
+label = pd.data(
+name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+cost = pd.cross_entropy(input=rnn_out, label=label)
+avg_cost = pd.mean(cost)
+return avg_cost
+
+
+def optimizer_func():
+return fluid.optimizer.Adagrad(
+learning_rate=1e-4,
+regularization=fluid.regularizer.L2DecayRegularizer(
+regularization_coeff=0.1))
+```
+
+## 训练模型
+
+### 定义训练环境
+定义您的训练环境，可以指定训练是发生在CPU还是GPU上。
+
+```python
+use_cuda = False
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+```
+
+### 定义数据提供器
+下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.wmt.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据，乱序化的大小为缓存大小`buf_size`。
+
+```python
+train_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+batch_size=batch_size)
+```
+
+### 构造训练器(trainer)
+训练器需要一个训练程序和一个训练优化函数。
+
+```python
+is_sparse = False
+trainer = fluid.Trainer(
+train_func=partial(train_program, is_sparse),
+place=place,
+optimizer_func=optimizer_func)
+```
+
+### 提供数据
+
+`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如，`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。
+
+```python
+feed_order = [
+'src_word_id', 'target_language_word', 'target_language_next_word'
+]
+```
+
+### 事件处理器
+回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
+
+```python
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+if event.step % 10 == 0:
+print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
+
+if event.step == 20:
+trainer.stop()
+```
+
+### 开始训练
+最后，我们传入训练循环数（`num_epoch`）和一些别的参数，调用 `trainer.train` 来开始训练。
+
+```python
+EPOCH_NUM = 1
+
+trainer.train(
+reader=train_reader,
+num_epochs=EPOCH_NUM,
+event_handler=event_handler,
+feed_order=feed_order)
+```
+
+## 应用模型
+
+### 定义解码部分
+
+使用上面定义的 `encoder` 和 `decoder` 函数来推测翻译后的对应id和分数.
+
+```python
+context = encoder(is_sparse)
+translation_ids, translation_scores = decode(context, is_sparse)
+```
+
+### 定义数据
+
+我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中，我们用`wmt14.test`数据中的第一个记录来做推测，最后我们用"源字典"和"目标字典"来列印对应的句子结果。
+
+```python
+init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+init_scores_data = np.array(
+[1. for _ in range(batch_size)], dtype='float32')
+init_ids_data = init_ids_data.reshape((batch_size, 1))
+init_scores_data = init_scores_data.reshape((batch_size, 1))
+init_lod = [1] * batch_size
+init_lod = [init_lod, init_lod]
+
+init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
+test_data = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.wmt14.test(dict_size), buf_size=1000),
+batch_size=batch_size)
+
+feed_order = ['src_word_id']
+feed_list = [
+framework.default_main_program().global_block().var(var_name)
+for var_name in feed_order
+]
+feeder = fluid.DataFeeder(feed_list, place)
+
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+```
+
+### 测试
+现在我们可以进行预测了。我们要在`feed_order`提供对应参数，放在`executor`上运行以取得id和分数结果
+
+```python
+exe = Executor(place)
+exe.run(framework.default_startup_program())
+
+for data in test_data():
+feed_data = map(lambda x: [x[0]], data)
+feed_dict = feeder.feed(feed_data)
+feed_dict['init_ids'] = init_ids
+feed_dict['init_scores'] = init_scores
+
+results = exe.run(
+framework.default_main_program(),
+feed=feed_dict,
+fetch_list=[translation_ids, translation_scores],
+return_numpy=False)
+
+result_ids = np.array(results[0])
+result_scores = np.array(results[1])
+
+print("Original sentence:")
+print(" ".join([src_dict[w] for w in feed_data[0][0]]))
+print("Translated sentence:")
+print(" ".join([trg_dict[w] for w in result_ids]))
+print("Corresponding score: ", result_scores)
+
+break
+```
+
+## 总结
+
+端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中，我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq（Sequence to Sequence，序列到序列）学习问题，因此，Seq2Seq中的query改写（query rewriting）、摘要、单轮对话等问题都可以用本教程的模型来解决。
+
+## 参考文献
+
+1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009.
+2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
+3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014.
+4.  Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015.
+5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318.
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.ipynb_checkpoints
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
new file mode 100644
index 0000000000000000000000000000000000000000..c213608e769f69fb2cfe8597f8e696ee53730e3d
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..8aedb2204371e7691140ceffa5992f6080bbf097
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
new file mode 100644
index 0000000000000000000000000000000000000000..4298567ac5600173343299999965b20612e7affe
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..a98e7cc67606b31e4c945f7eb907563e46dcef56
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..7fd97b9cc3a0b9105b41591af4e8f8e4646bd681
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..90c9b09fb78db98391ee199934f2d16efd6d6652
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..61e63d9147cbc2901706ef80776d706e5368c3c5
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..fbcae2be81141be955076e877b94b0ea5d7e4d4a
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..09a07f3dc30abc57ab3731af054dd83491acc9a6
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md
@@ -0,0 +1,528 @@
+# 个性化推荐
+
+本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+在网络技术不断发展和电子商务规模不断扩大的背景下，商品数量和种类快速增长，用户需要花费大量时间才能找到自己想买的商品，这就是信息超载问题。为了解决这个难题，推荐系统（Recommender System）应运而生。
+
+个性化推荐系统是信息过滤系统（Information Filtering System）的子集，它可以用在很多领域，如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为，发现用户的个性化需求与兴趣特点，将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同，推荐系统不需要用户准确地描述出自己的需求，而是根据分析历史行为建模，主动提供满足用户兴趣和需求的信息。
+
+传统的推荐系统方法主要有：
+
+- 协同过滤推荐（Collaborative Filtering Recommendation）：该方法收集分析用户历史行为、活动、偏好，计算一个用户与其他用户的相似度，利用目标用户的相似用户对商品评价的加权评价值，来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品；缺点是对于没有任何行为的新用户存在冷启动的问题，同时也存在用户与商品之间的交互数据不够多造成的稀疏问题，会导致模型难以找到相近用户。
+- 基于内容过滤推荐[[1](#参考文献)]（Content-based Filtering Recommendation）：该方法利用商品的内容描述，抽象出有意义的特征，通过计算用户的兴趣和商品描述之间的相似度，来给用户做推荐。优点是简单直接，不需要依据其他用户对商品的评价，而是通过商品属性进行商品相似度度量，从而推荐给用户所感兴趣商品的相似商品；缺点是对于没有任何行为的新用户同样存在冷启动的问题。
+- 组合推荐[[2](#参考文献)]（Hybrid Recommendation）：运用不同的输入和技术共同进行推荐，以弥补各自推荐技术的缺点。
+
+其中协同过滤是应用最广泛的技术之一，它又可以分为多个子类：基于用户 （User-Based）的推荐[[3](#参考文献)] 、基于物品（Item-Based）的推荐[[4](#参考文献)]、基于社交网络关系（Social-Based）的推荐[[5](#参考文献)]、基于模型（Model-based）的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想，此后，基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。
+
+深度学习具有优秀的自动提取特征的能力，能够学习多层次的抽象特征表示，并对异质或跨域的内容信息进行学习，可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型，以及如何使用PaddlePaddle实现模型。
+
+## 效果展示
+
+我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后，只需要输入对应的用户ID和电影ID，就可以得出一个匹配的分数（范围[0,5]，分数越高视为兴趣越大），然后根据所有电影的推荐得分排序，推荐给用户可能感兴趣的电影。
+
+```
+Input movie_id: 1962
+Input user_id: 1
+Prediction Score is 4.25
+```
+
+## 模型概览
+
+本章中，我们首先介绍YouTube的视频推荐系统[[7](#参考文献)]，然后介绍我们实现的融合推荐模型。
+
+### YouTube的深度神经网络推荐系统
+
+YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成：候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选，排序网络对候选进行打分排序，输出排名最高的数十个结果。系统结构如图1所示：
+
+![YouTube_Overview](./image/YouTube_Overview.png)
+<p align="center">
+图1. YouTube 推荐系统结构
+</p>
+
+#### 候选生成网络（Candidate Generation Network）
+
+候选生成网络将推荐问题建模为一个类别数极大的多类分类问题：对于一个Youtube用户，使用其观看历史（视频ID）、搜索词记录（search tokens）、人口学信息（如地理位置、用户登录设备）、二值特征（如性别，是否登录）和连续特征（如用户年龄）等，对视频库中所有视频进行多分类，得到每一类别的分类结果（即每一个视频的推荐概率），最终输出概率较高的几百个视频。
+
+首先，将观看历史及搜索词记录这类历史信息，映射为向量后取平均值得到定长表示；同时，输入人口学特征以优化新用户的推荐效果，并将二值特征和连续特征归一化处理到[0, 1]范围。接下来，将所有特征表示拼接为一个向量，并输入给非线形多层感知器（MLP，详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程）处理。最后，训练时将MLP的输出给softmax做分类，预测时计算用户的综合特征（MLP的输出）与所有视频的相似度，取得分最高的`$k$`个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
+
+![Deep_candidate_generation_model_architecture](./image/Deep_candidate_generation_model_architecture.png)
+<p align="center">
+图2. 候选生成网络结构
+</p>
+
+对于一个用户`$U$`，预测此刻用户要观看的视频`$\omega$`为视频`$i$`的概率公式为：
+
+$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
+
+其中`$u$`为用户`$U$`的特征表示，`$V$`为视频库集合，`$v_i$`为视频库中第`$i$`个视频的特征表示。`$u$`和`$v_i$`为长度相等的向量，两者点积可以通过全连接层实现。
+
+考虑到softmax分类的类别数非常多，为了保证一定的计算效率：1）训练阶段，使用负样本类别采样将实际计算的类别数缩小至数千；2）推荐（预测）阶段，忽略softmax的归一化计算（不影响结果），将类别打分问题简化为点积（dot product）空间中的最近邻（nearest neighbor）搜索问题，取与`$u$`最近的`$k$`个视频作为生成的候选。
+
+#### 排序网络（Ranking Network）
+排序网络的结构类似于候选生成网络，但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似，这里也构造了大量的用于视频排序的相关特征（如视频 ID、上次观看时间等）。这些特征的处理方式和候选生成网络类似，不同之处是排序网络的顶部是一个加权逻辑回归（weighted logistic regression），它对所有候选视频进行打分，从高到底排序后将分数较高的一些视频返回给用户。
+
+### 融合推荐模型
+本节会使卷积神经网络（Convolutional Neural Networks）来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。
+
+#### 文本卷积神经网络（CNN）
+
+卷积神经网络经常用来处理具有类似网格拓扑结构（grid-like topology）的数据。例如，图像可以视为二维网格的像素点，自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征，并对其进行组合抽象得到更高级的特征表示。实验表明，卷积神经网络能高效地对图像及文本问题进行建模处理。
+
+卷积神经网络主要由卷积（convolution）和池化（pooling）操作构成，其应用及组合方式灵活多变，种类繁多。本小结我们以如图3所示的网络进行讲解：
+
+![text_cnn](./image/text_cnn.png)
+<p align="center">
+图3. 卷积神经网络文本分类模型
+</p>
+
+假设待处理句子的长度为`$n$`，其中第`$i$`个词的词向量（word embedding）为`$x_i\in\mathbb{R}^k$`，`$k$`为维度大小。
+
+首先，进行词向量的拼接操作：将每`$h$`个词拼接起来形成一个大小为`$h$`的词窗口，记为`$x_{i:i+h-1}$`，它表示词序列`$x_{i},x_{i+1},\ldots,x_{i+h-1}$`的拼接，其中，`$i$`表示词窗口中第一个词在整个句子中的位置，取值范围从`$1$`到`$n-h+1$`，`$x_{i:i+h-1}\in\mathbb{R}^{hk}$`。
+
+其次，进行卷积操作：把卷积核(kernel)`$w\in\mathbb{R}^{hk}$`应用于包含`$h$`个词的窗口`$x_{i:i+h-1}$`，得到特征`$c_i=f(w\cdot x_{i:i+h-1}+b)$`，其中`$b\in\mathbb{R}$`为偏置项（bias），`$f$`为非线性激活函数，如`$sigmoid$`。将卷积核应用于句子中所有的词窗口`${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$`，产生一个特征图（feature map）：
+
+$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$
+
+接下来，对特征图采用时间维度上的最大池化（max pooling over time）操作得到此卷积核对应的整句话的特征`$\hat c$`，它是特征图中所有元素的最大值：
+
+$$\hat c=max(c)$$
+
+#### 模型概览
+
+在融合推荐模型的电影推荐系统中：
+
+1. 首先，使用用户特征和电影特征作为神经网络的输入，其中：
+
+- 用户特征融合了四个属性信息，分别是用户ID、性别、职业和年龄。
+
+- 电影特征融合了三个属性信息，分别是电影ID、电影类型ID和电影名称。
+
+2. 对用户特征，将用户ID映射为维度大小为256的向量表示，输入全连接层，并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。
+
+3. 对电影特征，将电影ID以类似用户ID的方式进行处理，电影类型ID以向量的形式直接输入全连接层，电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。
+
+4. 得到用户和电影的向量表示后，计算二者的余弦相似度作为推荐系统的打分。最后，用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。
+
+![rec_regression_network](./image/rec_regression_network.png)
+<p align="center">
+图4. 融合推荐模型
+</p>
+
+## 数据准备
+
+### 数据介绍与下载
+
+我们以 [MovieLens 百万数据集（ml-1m）](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价（评分范围 1~5 分，均为整数），由 GroupLens Research 实验室搜集整理。
+
+Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens`
+
+
+```python
+import paddle
+movie_info = paddle.dataset.movielens.movie_info()
+print movie_info.values()[0]
+```
+
+
+```python
+# Run this block to show dataset's documentation
+# help(paddle.dataset.movielens)
+```
+
+在原始数据中包含电影的特征数据，用户的特征数据，和用户对电影的评分。
+
+例如，其中某一个电影特征为:
+
+
+```python
+movie_info = paddle.dataset.movielens.movie_info()
+print movie_info.values()[0]
+```
+
+<MovieInfo id(1), title(Toy Story ), categories(['Animation', "Children's", 'Comedy'])>
+
+
+这表示，电影的id是1，标题是《Toy Story》，该电影被分为到三个类别中。这三个类别是动画，儿童，喜剧。
+
+
+```python
+user_info = paddle.dataset.movielens.user_info()
+print user_info.values()[0]
+```
+
+<UserInfo id(1), gender(F), age(1), job(10)>
+
+
+这表示，该用户ID是1，女性，年龄比18岁还年轻。职业ID是10。
+
+
+其中，年龄使用下列分布
+*  1:  "Under 18"
+* 18:  "18-24"
+* 25:  "25-34"
+* 35:  "35-44"
+* 45:  "45-49"
+* 50:  "50-55"
+* 56:  "56+"
+
+职业是从下面几种选项里面选则得出:
+*  0:  "other" or not specified
+*  1:  "academic/educator"
+*  2:  "artist"
+*  3:  "clerical/admin"
+*  4:  "college/grad student"
+*  5:  "customer service"
+*  6:  "doctor/health care"
+*  7:  "executive/managerial"
+*  8:  "farmer"
+*  9:  "homemaker"
+* 10:  "K-12 student"
+* 11:  "lawyer"
+* 12:  "programmer"
+* 13:  "retired"
+* 14:  "sales/marketing"
+* 15:  "scientist"
+* 16:  "self-employed"
+* 17:  "technician/engineer"
+* 18:  "tradesman/craftsman"
+* 19:  "unemployed"
+* 20:  "writer"
+
+而对于每一条训练/测试数据，均为 <用户特征> + <电影特征> + 评分。
+
+例如，我们获得第一条训练数据:
+
+
+```python
+train_set_creator = paddle.dataset.movielens.train()
+train_sample = next(train_set_creator())
+uid = train_sample[0]
+mov_id = train_sample[len(user_info[uid].value())]
+print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1])
+```
+
+User <UserInfo id(1), gender(F), age(1), job(10)> rates Movie <MovieInfo id(1193), title(One Flew Over the Cuckoo's Nest ), categories(['Drama'])> with Score [5.0]
+
+
+即用户1对电影1193的评价为5分。
+
+## 模型配置说明
+
+下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。
+
+
+```python
+import math
+import sys
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.nets as nets
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+```
+
+然后为我们的用户特征综合模型定义模型配置
+
+```python
+def get_usr_combined_features():
+
+USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+usr_emb = layers.embedding(
+input=uid,
+dtype='float32',
+size=[USR_DICT_SIZE, 32],
+param_attr='user_table',
+is_sparse=IS_SPARSE)
+
+usr_fc = layers.fc(input=usr_emb, size=32)
+
+USR_GENDER_DICT_SIZE = 2
+
+usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+usr_gender_emb = layers.embedding(
+input=usr_gender_id,
+size=[USR_GENDER_DICT_SIZE, 16],
+param_attr='gender_table',
+is_sparse=IS_SPARSE)
+
+usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+usr_age_emb = layers.embedding(
+input=usr_age_id,
+size=[USR_AGE_DICT_SIZE, 16],
+is_sparse=IS_SPARSE,
+param_attr='age_table')
+
+usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+usr_job_emb = layers.embedding(
+input=usr_job_id,
+size=[USR_JOB_DICT_SIZE, 16],
+param_attr='job_table',
+is_sparse=IS_SPARSE)
+
+usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+concat_embed = layers.concat(
+input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+return usr_combined_features
+```
+
+如上述代码所示，对于每个用户，我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便，我们借鉴NLP中的语言模型，将这几维离散的整数值，变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。
+
+然后，我们对于所有的用户特征，均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。
+
+进而，我们对每一个电影特征做类似的变换，网络配置为:
+
+
+```python
+def get_mov_combined_features():
+
+MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+mov_emb = layers.embedding(
+input=mov_id,
+dtype='float32',
+size=[MOV_DICT_SIZE, 32],
+param_attr='movie_table',
+is_sparse=IS_SPARSE)
+
+mov_fc = layers.fc(input=mov_emb, size=32)
+
+CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+category_id = layers.data(
+name='category_id', shape=[1], dtype='int64', lod_level=1)
+
+mov_categories_emb = layers.embedding(
+input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+mov_categories_hidden = layers.sequence_pool(
+input=mov_categories_emb, pool_type="sum")
+
+MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+mov_title_id = layers.data(
+name='movie_title', shape=[1], dtype='int64', lod_level=1)
+
+mov_title_emb = layers.embedding(
+input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+mov_title_conv = nets.sequence_conv_pool(
+input=mov_title_emb,
+num_filters=32,
+filter_size=3,
+act="tanh",
+pool_type="sum")
+
+concat_embed = layers.concat(
+input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+return mov_combined_features
+```
+
+电影标题名称(title)是一个序列的整数，整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层，这个层会在时间维度上使用卷积和池化。因为如此，所以输出会是固定长度，尽管输入的序列长度各不相同。
+
+最后，我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。
+
+```python
+def inference_program():
+usr_combined_features = get_usr_combined_features()
+mov_combined_features = get_mov_combined_features()
+
+inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+scale_infer = layers.scale(x=inference, scale=5.0)
+
+return scale_infer
+```
+
+进而，我们定义一个`train_program`来使用`inference_program`计算出的结果，在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。
+
+```python
+def train_program():
+
+scale_infer = inference_program()
+
+label = layers.data(name='score', shape=[1], dtype='float32')
+square_cost = layers.square_error_cost(input=scale_infer, label=label)
+avg_cost = layers.mean(square_cost)
+
+return [avg_cost, scale_infer]
+
+
+def optimizer_func():
+return fluid.optimizer.SGD(learning_rate=0.2)
+```
+
+
+## 训练模型
+
+### 定义训练环境
+定义您的训练环境，可以指定训练是发生在CPU还是GPU上。
+
+```python
+use_cuda = False
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+```
+
+### 定义数据提供器
+下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据，乱序化的大小为缓存大小`buf_size`。
+
+```python
+train_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.movielens.train(), buf_size=8192),
+batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
+```
+
+### 构造训练器(trainer)
+训练器需要一个训练程序和一个训练优化函数。
+
+```python
+trainer = fluid.Trainer(
+train_func=train_program, place=place, optimizer_func=optimizer_func)
+```
+
+### 提供数据
+
+`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如，`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。
+
+```python
+feed_order = [
+'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+'movie_title', 'score'
+]
+```
+
+### 事件处理器
+回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
+
+```python
+# Specify the directory path to save the parameters
+params_dirname = "recommender_system.inference.model"
+
+from paddle.v2.plot import Ploter
+test_title = "Test cost"
+plot_cost = Ploter(test_title)
+
+
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+avg_cost_set = trainer.test(
+reader=test_reader, feed_order=feed_order)
+
+# get avg cost
+avg_cost = np.array(avg_cost_set).mean()
+
+plot_cost.append(test_title, event.step, avg_cost_set[0])
+plot_cost.plot()
+
+print("avg_cost: %s" % avg_cost)
+print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+float(avg_cost)))
+
+if event.step == 20: # Adjust this number for accuracy
+trainer.save_params(params_dirname)
+trainer.stop()
+```
+
+### 开始训练
+最后，我们传入训练循环数（`num_epoch`）和一些别的参数，调用 `trainer.train` 来开始训练。
+
+```python
+trainer.train(
+num_epochs=1,
+event_handler=event_handler,
+reader=train_reader,
+feed_order=feed_order)
+```
+
+## 应用模型
+
+### 构建预测器
+传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。
+
+```python
+inferencer = fluid.Inferencer(
+inference_program, param_path=params_dirname, place=place)
+```
+
+### 生成测试用输入数据
+使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列，每个元素是一个索引号的序列。`lod`是细节层次的信息，对应于`data`。比如，data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列，长度分别是3和2。于是相应地 lod = [[3, 2]]，它表明其包含一层细节信息，意味着 `data` 有两个序列，长度分别是3和2。
+
+在这个预测例子中，我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分
+
+```python
+infer_movie_id = 783
+infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title
+user_id = fluid.create_lod_tensor([[1]], [[1]], place)
+gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
+age_id = fluid.create_lod_tensor([[0]], [[1]], place)
+job_id = fluid.create_lod_tensor([[10]], [[1]], place)
+movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame
+category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical
+movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
+place) # 'hunchback','of','notre','dame','the'
+```
+
+### 测试
+现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。
+
+
+```python
+results = inferencer.infer(
+{
+'user_id': user_id,
+'gender_id': gender_id,
+'age_id': age_id,
+'job_id': job_id,
+'movie_id': movie_id,
+'category_id': category_id,
+'movie_title': movie_title
+},
+return_numpy=False)
+```
+
+## 总结
+
+本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统，并以电影推荐为例，使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面，而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术，也将会在推荐系统领域大放异彩。
+
+## 参考文献
+
+1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.
+2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2.
+3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186.
+4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001.
+5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA
+6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).
+7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198.
+
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
@@ -0,0 +1,10 @@
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..98fbea413a98a619004ca669c67f5f867fe974c9
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..d73a00bf2c1fca2f9b8c26bccf5ea844fa1db50b
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
new file mode 100755
index 0000000000000000000000000000000000000000..26c904102a6e6c4e30f0048b81373ae8c148b355
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6b2adf70f2b5112a2e82505da5cff9f5fd0c6298
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..8b5dbd726178b5555c513294e7b10a81acc96ff5
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..624de7e4d439953c7255481fb0c9d62ce94f3900
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md
@@ -0,0 +1,354 @@
+# 情感分析
+
+本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+在自然语言处理中，情感分析一般是指判断一段文本所表达的情绪状态。其中，一段文本可以是一个句子，一个段落或一个文档。情绪状态可以是两类，如（正面，负面），（高兴，悲伤）；也可以是三类，如（积极，消极，中性）等等。情感分析的应用场景十分广泛，如把用户在购物网站（亚马逊、天猫、淘宝等）、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论；或为了分析用户对于某一产品的整体使用感受，抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子：
+
+| 电影评论       | 类别  |
+| --------     | -----  |
+| 在冯小刚这几年的电影里，算最好的一部的了| 正面 |
+| 很不好看，好像一个地方台的电视剧     | 负面 |
+| 圆方镜头全程炫技，色调背景美则美矣，但剧情拖沓，口音不伦不类，一直努力却始终无法入戏| 负面|
+|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉，看得实在太舒服了。。|正面|
+
+<p align="center">表格 1 电影评论情感分析</p>
+
+在自然语言处理中，情感分析属于典型的**文本分类**问题，即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前，主流的文本表示方法为词袋模型BOW(bag of words)，话题模型等等；分类方法有SVM(support vector machine), LR(logistic regression)等等。
+
+对于一段文本，BOW表示会忽略其词顺序、语法和句法，将这段文本仅仅看做是一个词集合，因此BOW方法并不能充分表示文本的语义信息。例如，句子“这部电影糟糕透了”和“一个乏味，空洞，没有内涵的作品”在情感分析中具有很高的语义相似度，但是它们的BOW表示的相似度为0。又如，句子“一个空洞，没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高，但实际上它们的意思很不一样。
+
+本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷，它在考虑词顺序的基础上把文本映射到低维度的语义空间，并且以端对端（end to end）的方式进行文本表示及分类，其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。
+
+## 模型概览
+本章所使用的文本表示模型为卷积神经网络（Convolutional Neural Networks）和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。
+
+### 文本卷积神经网络简介（CNN）
+
+我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程，这里进行一个简单的回顾。
+
+对卷积神经网络来说，首先使用卷积处理输入的词向量序列，产生一个特征图（feature map），对特征图采用时间维度上的最大池化（max pooling over time）操作得到此卷积核对应的整句话的特征，最后，将所有卷积核得到的特征拼接起来即为文本的定长向量表示，对于文本分类问题，将其连接至softmax即构建出完整的模型。在实际应用中，我们会使用多个卷积核来处理句子，窗口大小相同的卷积核堆叠起来形成一个矩阵，这样可以更高效的完成运算。另外，我们也可使用窗口大小不同的卷积核来处理句子，[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核，不同颜色表示不同大小的卷积核操作。
+
+对于一般的短文本分类问题，上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示，可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。
+
+### 循环神经网络（RNN）
+
+循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上，循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据（词序列），近年来，循环神经网络及其变体（如long short term memory\[[5](#参考文献)\]等）在自然语言处理的多个领域，如语言模型、句法解析、语义角色标注（或一般的序列标注）、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
+
+![rnn](./image/rnn.png)
+<p align="center">
+图1. 循环神经网络按时间展开的示意图
+</p>
+
+循环神经网络按时间展开后如图1所示：在第`$t$`时刻，网络读入第`$t$`个输入`$x_t$`（向量表示）及前一时刻隐层的状态值`$h_{t-1}$`（向量表示，`$h_0$`一般初始化为`$0$`向量），计算得出本时刻隐层的状态值`$h_t$`，重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为`$f$`，则其公式可表示为：
+
+$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$
+
+其中`$W_{xh}$`是输入到隐层的矩阵参数，`$W_{hh}$`是隐层到隐层的矩阵参数，`$b_h$`为隐层的偏置向量（bias）参数，`$\sigma$`为`$sigmoid$`函数。
+
+在处理自然语言时，一般会先将词（one-hot表示）映射为其词向量（word embedding）表示，然后再作为循环神经网络每一时刻的输入`$x_t$`。此外，可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如，可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层（deep or stacked）循环神经网络，或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。
+
+### 长短期记忆网络（LSTM）
+
+对于较长的序列数据，循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题，Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。
+
+相比于简单的循环神经网络，LSTM增加了记忆单元`$c$`、输入门`$i$`、遗忘门`$f$`及输出门`$o$`。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为`$F$`，则其公式为：
+
+$$ h_t=F(x_t,h_{t-1})$$
+
+`$F$`由下列公式组合而成\[[7](#参考文献)\]：
+$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$
+$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$
+$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$
+$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$
+$$ h_t = o_t\odot tanh(c_t) $$
+其中，`$i_t, f_t, c_t, o_t$`分别表示输入门，遗忘门，记忆单元及输出门的向量值，带角标的`$W$`及`$b$`为模型参数，`$tanh$`为双曲正切函数，`$\odot$`表示逐元素（elementwise）的乘法操作。输入门控制着新输入进入记忆单元`$c$`的强度，遗忘门控制着记忆单元维持上一时刻值的强度，输出门控制着输出记忆单元的强度。三种门的计算方式类似，但有着完全不同的参数，它们各自以不同的方式控制着记忆单元`$c$`，如图2所示：
+
+![lstm](./image/lstm.png)
+<p align="center">
+图2. 时刻`$t$`的LSTM [7]
+</p>
+
+LSTM通过给简单的循环神经网络增加记忆及控制门的方式，增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\]，其设计更为简洁一些。**这些改进虽然各有不同，但是它们的宏观描述却与简单的循环神经网络一样（如图2所示），即隐状态依据当前输入及前一时刻的隐状态来改变，不断地循环这一过程直至输入处理完毕：**
+
+$$ h_t=Recrurent(x_t,h_{t-1})$$
+
+其中，`$Recrurent$`可以表示简单的循环神经网络、GRU或LSTM。
+
+### 栈式双向LSTM（Stacked Bidirectional LSTM）
+
+对于正常顺序的循环神经网络，`$h_t$`包含了`$t$`时刻之前的输入信息，也就是上文信息。同样，为了得到下文信息，我们可以使用反方向（将输入逆序处理）的循环神经网络。结合构建深层循环神经网络的方法（深层神经网络往往能得到更抽象和高级的特征表示），我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\]，来对时序数据进行建模。
+
+如图3所示（以三层为例），奇数层LSTM正向，偶数层LSTM反向，高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入，对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示（这一表示充分融合了文本的上下文信息，并且对文本进行了深层次抽象），最后我们将文本表示连接至softmax构建分类模型。
+
+![stacked_lstm](./image/stacked_lstm.jpg)
+<p align="center">
+图3. 栈式双向LSTM用于文本分类
+</p>
+
+
+## 数据集介绍
+
+我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中，负面评论的得分小于等于4，正面评论的得分大于等于7，满分10分。
+```text
+aclImdb
+|- test
+|-- neg
+|-- pos
+|- train
+|-- neg
+|-- pos
+```
+Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取，并提供了读取字典、训练数据、测试数据等API。
+
+## 配置模型
+
+在该示例中，我们实现了两种文本分类算法，分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络，以及[栈式双向LSTM](#栈式双向LSTM（Stacked Bidirectional LSTM）)。我们首先引入要用到的库和定义全局变量：
+
+```python
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+BATCH_SIZE = 128
+USE_GPU = False
+```
+
+
+### 文本卷积神经网络
+我们构建神经网络`convolution_net`，示例代码如下。
+需要注意的是：`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。
+
+```python
+def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
+emb = fluid.layers.embedding(
+input=data, size=[input_dim, emb_dim], is_sparse=True)
+conv_3 = fluid.nets.sequence_conv_pool(
+input=emb,
+num_filters=hid_dim,
+filter_size=3,
+act="tanh",
+pool_type="sqrt")
+conv_4 = fluid.nets.sequence_conv_pool(
+input=emb,
+num_filters=hid_dim,
+filter_size=4,
+act="tanh",
+pool_type="sqrt")
+prediction = fluid.layers.fc(
+input=[conv_3, conv_4], size=class_dim, act="softmax")
+return prediction
+```
+
+网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
+
+### 栈式双向LSTM
+
+栈式双向神经网络`stacked_lstm_net`的代码片段如下：
+
+```python
+def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
+
+emb = fluid.layers.embedding(
+input=data, size=[input_dim, emb_dim], is_sparse=True)
+
+fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+inputs = [fc1, lstm1]
+
+for i in range(2, stacked_num + 1):
+fc = fluid.layers.fc(input=inputs, size=hid_dim)
+lstm, cell = fluid.layers.dynamic_lstm(
+input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+inputs = [fc, lstm]
+
+fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+size=class_dim,
+act='softmax')
+return prediction
+```
+以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。`paddle.activation.Softmax`函数用来计算分类属于某个类别的概率。
+
+重申一下，此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。
+
+接下来我们定义预测程序（`inference_program`）。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。
+
+```python
+def inference_program(word_dict):
+data = fluid.layers.data(
+name="words", shape=[1], dtype="int64", lod_level=1)
+
+dict_dim = len(word_dict)
+net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+return net
+```
+
+我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。
+
+因为是有监督的学习，训练集的标签也在`paddle.layer.data`中定义了。在训练过程中，交叉熵用来在`paddle.layer.classification_cost`中作为损失函数。
+
+在测试过程中，分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。
+
+```python
+def train_program(word_dict):
+prediction = inference_program(word_dict)
+label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+cost = fluid.layers.cross_entropy(input=prediction, label=label)
+avg_cost = fluid.layers.mean(cost)
+accuracy = fluid.layers.accuracy(input=prediction, label=label)
+return [avg_cost, accuracy]
+
+
+def optimizer_func():
+return fluid.optimizer.Adagrad(learning_rate=0.002)
+```
+
+## 训练模型
+
+### 定义训练环境
+
+定义您的训练是在CPU上还是在GPU上：
+
+
+```python
+use_cuda = False
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+```
+
+### 定义数据提供器
+
+下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 每次会在乱序化后提供一个大小为BATCH_SIZE的数据，乱序化的大小为缓存大小buf_size。
+
+注意：读取IMDB的数据可能会花费几分钟的时间，请耐心等待。
+
+```python
+print("Loading IMDB word dict....")
+word_dict = paddle.dataset.imdb.word_dict()
+
+print ("Reading training data....")
+train_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.imdb.train(word_dict), buf_size=25000),
+batch_size=BATCH_SIZE)
+```
+
+### 构造训练器(trainer)
+训练器需要一个训练程序和一个训练优化函数。
+
+```python
+trainer = fluid.Trainer(
+train_func=partial(train_program, word_dict),
+place=place,
+optimizer_func=optimizer_func)
+```
+
+### 提供数据
+
+`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如，`imdb.train`产生的第一列的数据对应的是`words`这个特征。
+
+```python
+feed_order = ['words', 'label']
+```
+
+### 事件处理器
+
+回调函数event_handler在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
+
+```python
+# Specify the directory path to save the parameters
+params_dirname = "understand_sentiment_conv.inference.model"
+
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+print("Step {0}, Epoch {1} Metrics {2}".format(
+event.step, event.epoch, map(np.array, event.metrics)))
+
+if event.step == 10:
+trainer.save_params(params_dirname)
+trainer.stop()
+```
+
+### 开始训练
+
+最后，我们传入训练循环数（num_epoch）和一些别的参数，调用 trainer.train 来开始训练。
+
+```python
+trainer.train(
+num_epochs=1,
+event_handler=event_handler,
+reader=train_reader,
+feed_order=feed_order)
+```
+
+## 应用模型
+
+### 构建预测器
+
+传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。
+
+```python
+inferencer = fluid.Inferencer(
+inference_program, param_path=params_dirname, place=place)
+```
+
+### 生成测试用输入数据
+
+为了进行预测，我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词，则设为`unknown`。
+然后我们用`create_lod_tensor`来创建细节层次的张量。
+
+```python
+reviews_str = [
+'read the book forget the movie', 'this is a great movie', 'this is very bad'
+]
+reviews = [c.split() for c in reviews_str]
+
+UNK = word_dict['<unk>']
+lod = []
+for c in reviews:
+lod.append([word_dict.get(words, UNK) for words in c])
+
+base_shape = [[len(c) for c in lod]]
+
+tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
+```
+
+## 应用模型
+
+现在我们可以对每一条评论进行正面或者负面的预测啦。
+
+```python
+results = inferencer.infer({'words': tensor_words})
+
+for i, r in enumerate(results[0]):
+print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'")
+
+```
+
+
+## 总结
+
+本章我们以情感分析为例，介绍了使用深度学习的方法进行端对端的短文本分类，并且使用PaddlePaddle完成了全部相关实验。同时，我们简要介绍了两种文本处理模型：卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。
+
+
+## 参考文献
+1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
+2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
+3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016.
+4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449.
+5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780.
+6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166.
+7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013.
+8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
+9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
@@ -0,0 +1,3 @@
+data/train.list
+data/test.list
+data/simple-examples*
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
new file mode 100644
index 0000000000000000000000000000000000000000..384f59919a2c8dedb198e97d51434616648932e1
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
new file mode 100644
index 0000000000000000000000000000000000000000..76b7d4bc0f99372465bd9aa34721513d39ad0776
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..d985c393e618e9b79df05e4ff0ae57ccc93744d0
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
new file mode 100755
index 0000000000000000000000000000000000000000..2e16ab2f443732b8ef5404a8e7cd2457bc5eee23
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
new file mode 100644
index 0000000000000000000000000000000000000000..2449dce6a86b43b1b997ff418ed0dba56848463f
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e0b40a8f7aefdf46d42761305511f281c08e595
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..158bd64b8f8729dea67834a8d591d21bce8b8564
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce4a8bf4769183cbaff91793753d2350a3ce936c
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
new file mode 100644
index 0000000000000000000000000000000000000000..a3ab385845d3dc8b5c670bae91225bc8dd47a8bb
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..3c36c6d1f66eb98ea78c0673965d02a4ee3aa288
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..e73a6334ca1acd49379604f24d3d4e463192a902
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md
@@ -0,0 +1,440 @@
+
+# 词向量
+
+本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+
+本章我们介绍词的向量表征，也称为word embedding。词向量是自然语言处理中常见的一个操作，是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。
+
+在这些互联网服务里，我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较，我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。
+在这种方式里，每个词被表示成一个实数向量（one-hot vector），其长度为字典大小，每个维度对应一个字典里的每个词，除了这个词对应维度上的值是1，其他元素都是0。
+
+One-hot vector虽然自然，但是用处有限。比如，在互联网广告系统里，如果用户输入的query是“母亲节”，而有一个广告的关键词是“康乃馨”。虽然按照常理，我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨；但是这两个词对应的one-hot vectors之间的距离度量，无论是欧氏距离还是余弦相似度(cosine similarity)，由于其向量正交，都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是：每个词本身的信息量都太小。所以，仅仅给定两个词，不足以让我们准确判别它们是否相关。要想精确计算相关性，我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。
+
+在机器学习领域里，各种“知识”被各种模型表示，词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量（embedding vector），如`$embedding(Mother's\ Day) = [0.3, 4.2, -1.5, ...], embedding(Carnation) = [0.2, 5.6, -2.3, ...]$`。在这个映射到的实数向量表示中，希望两个语义（或用法）上相似的词对应的词向量“更像”，这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。
+
+词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前，传统做法是统计一个词语的共生矩阵`$X$`。`$X$`是一个`$|V| \times |V|$` 大小的矩阵，`$X_{ij}$`表示在所有语料中，词汇表`V`(vocabulary)中第i个词和第j个词同时出现的词数，`$|V|$`为词汇表的大小。对`$X$`做矩阵分解（如奇异值分解，Singular Value Decomposition \[[5](#参考文献)\]），得到的`$U$`即视为所有词的词向量：
+
+$$X = USV^T$$
+
+但这样的传统做法有很多问题：<br/>
+1) 由于很多词没有出现，导致矩阵极其稀疏，因此需要对词频做额外处理来达到好的矩阵分解效果；<br/>
+2) 矩阵非常大，维度太高(通常达到`$10^6*10^6$`的数量级)；<br/>
+3) 需要手动去掉停用词（如although, a,...），不然这些频繁出现的词也会影响矩阵分解的效果。
+
+
+基于神经网络的模型不需要计算存储一个在全语料上统计的大表，而是通过学习语义信息得到词向量，因此能很好地解决以上问题。在本章里，我们将展示基于神经网络训练词向量的细节，以及如何用PaddlePaddle训练一个词向量模型。
+
+
+## 效果展示
+
+本章中，当词向量训练好后，我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影（如下图所示）。从图中可以看出，语义相关的词语（如a, the, these; big, huge）在投影上距离很近，语意无关的词（如say, business; decision, japan）在投影上的距离很远。
+
+![2d_similarity](./image/2d_similarity.png)
+<p align="center">
+图1. 词向量的二维投影
+</p>
+
+另一方面，我们知道两个向量的余弦值在`$[-1,1]$`的区间内：两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0，两个方向完全相反的向量余弦值为-1，即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度:
+
+```
+similarity: 0.899180685161
+please input two words: big huge
+
+please input two words: from company
+similarity: -0.0997506977351
+```
+
+以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[应用模型](#应用模型)中详细描述用法。
+
+
+## 模型概览
+
+在这里我们介绍三个训练词向量的模型：N-gram模型，CBOW模型和Skip-gram模型，它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型，我们会先介绍语言模型的概念，并在之后的[训练模型](#训练模型)中，带大家用PaddlePaddle实现它。而后两个模型，是近年来最有名的神经元词向量模型，由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\]，虽然它们很浅很简单，但训练效果很好。
+
+### 语言模型
+
+在介绍词向量模型之前，我们先来引入一个概念：语言模型。
+语言模型旨在为语句的联合概率函数`$P(w_1, ..., w_T)$`建模, 其中`$w_i$`表示句子中的第i个词。语言模型的目标是，希望模型对有意义的句子赋予大概率，对没意义的句子赋予小概率。
+这样的模型可以应用于很多领域，如机器翻译、语音识别、信息检索、词性标注、手写识别等，它们都希望能得到一个连续序列的概率。 以信息检索为例，当你在搜索“how long is a football bame”时（bame是一个医学名词），搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低，而与bame近似的，可能引起错误的词中，game会使该句生成的概率最大。
+
+对语言模型的目标概率`$P(w_1, ..., w_T)$`，如果假设文本中每个词都是相互独立的，则整句话的联合概率可以表示为其中所有词语条件概率的乘积，即：
+
+$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$
+
+然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型：
+
+$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
+
+
+
+### N-gram neural model
+
+在计算语言学中，n-gram是一种重要的文本表示方法，表示一个文本中连续的n个项。基于具体的应用场景，每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法，用n-gram训练语言模型时，一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。
+
+Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型（Neural Network Language Model，NNLM）通过一个线性映射和一个非线性隐层连接，同时学习了语言模型和词向量，即通过学习大量语料得到词语的向量表达，通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难（curse of dimensionality）,即训练和测试数据不同导致的模型不准。注意：由于“神经概率语言模型”说法较为泛泛，我们在这里不用其NNLM的本名，考虑到其具体做法，本文中称该模型为N-gram neural model。
+
+我们在上文中已经讲到用条件概率建模语言模型，即一句话中第`$t$`个词的概率和该句话的前`$t-1$`个词相关。可实际上越远的词语其实对该词的影响越小，那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响，则有：
+
+$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$
+
+给定一些真实语料，这些语料中都是有意义的句子，N-gram模型的优化目标则是最大化目标函数:
+
+$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
+
+其中`$f(w_t, w_{t-1}, ..., w_{t-n+1})$`表示根据历史n-1个词得到当前词`$w_t$`的条件概率，`$R(\theta)$`表示参数正则项。
+
+![nnlm](./image/nnlm.png)
+<p align="center">
+图2. N-gram神经网络模型
+</p>
+
+图2展示了N-gram神经网络模型，从下往上看，该模型分为以下几个部分：
+- 对于每个样本，模型输入`$w_{t-n+1},...w_{t-1}$`, 输出句子第t个词为字典中`|V|`个词的概率。
+
+每个输入词`$w_{t-n+1},...w_{t-1}$`首先通过映射矩阵映射到词向量`$C(w_{t-n+1}),...C(w_{t-1})$`。
+
+- 然后所有词语的词向量连接成一个大向量，并经过一个非线性映射得到历史词语的隐层表示：
+
+$$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$
+
+其中，`$x$`为所有词语的词向量连接成的大向量，表示文本历史特征；`$\theta$`、`$U$`、`$b_1$`、`$b_2$`和`$W$`分别为词向量层到隐层连接的参数。`$g$`表示未经归一化的所有输出单词概率，`$g_i$`表示未经归一化的字典中第`$i$`个单词的输出概率。
+
+- 根据softmax的定义，通过归一化`$g_i$`, 生成目标词`$w_t$`的概率为：
+
+$$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$
+
+- 整个网络的损失值(cost)为多类分类交叉熵，用公式表示为
+
+$$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$
+
+其中`$y_k^i$`表示第`$i$`个样本第`$k$`类的真实标签(0或1)，`$softmax(g_k^i)$`表示第i个样本第k类softmax输出的概率。
+
+
+
+### Continuous Bag-of-Words model(CBOW)
+
+CBOW模型通过一个词的上下文（各N个词）预测当前词。当N=2时，模型如下图所示：
+
+![cbow](./image/cbow.png)
+<p align="center">
+图3. CBOW模型
+</p>
+
+具体来说，不考虑上下文的词语输入顺序，CBOW是用上下文词语的词向量的均值来预测当前词。即：
+
+$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
+
+其中`$x_t$`为第`$t$`个词的词向量，分类分数（score）向量 `$z=U*context$`，最终的分类`$y$`采用softmax，损失函数采用多类分类交叉熵。
+
+### Skip-gram model
+
+CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去掉了噪声，因此在小数据集上很有效。而Skip-gram的方法中，用一个词预测其上下文，得到了当前词上下文的很多样本，因此可用于更大的数据集。
+
+![skipgram](./image/skipgram.png)
+<p align="center">
+图4. Skip-gram模型
+</p>
+
+如上图所示，Skip-gram模型的具体做法是，将一个词的词向量映射到`$2n$`个词的词向量（`$2n$`表示当前输入词的前后各`$n$`个词），然后分别通过softmax得到这`$2n$`个词的分类损失值之和。
+
+
+## 数据准备
+
+### 数据介绍
+
+本教程使用Penn Treebank （PTB）（经Tomas Mikolov预处理过的版本）数据集。PTB数据集较小，训练速度快，应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下：
+
+<p align="center">
+<table>
+<tr>
+<td>训练数据</td>
+<td>验证数据</td>
+<td>测试数据</td>
+</tr>
+<tr>
+<td>ptb.train.txt</td>
+<td>ptb.valid.txt</td>
+<td>ptb.test.txt</td>
+</tr>
+<tr>
+<td>42068句</td>
+<td>3370句</td>
+<td>3761句</td>
+</tr>
+</table>
+</p>
+
+
+### 数据预处理
+
+本章训练的是5-gram模型，表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`，自动做数据的下载与预处理，方便大家使用。
+
+预处理会把数据集中的每一句话前后加上开始符号`<s>`以及结束符号`<e>`。然后依据窗口大小（本教程中为5），从头到尾每次向右滑动窗口并生成一条数据。
+
+如"I have a dream that one day" 一句提供了5条数据：
+
+```text
+<s> I have a dream
+I have a dream that
+have a dream that one
+a dream that one day
+dream that one day <e>
+```
+
+最后，每个输入会按其单词次在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。
+
+## 编程实现
+
+本配置的模型结构如下图所示：
+
+![ngram](./image/ngram.png)
+<p align="center">
+图5. 模型配置中的N-gram神经网络模型
+</p>
+
+首先，加载所需要的包：
+
+```python
+import paddle
+import paddle.fluid as fluid
+import numpy
+from functools import partial
+import math
+import os
+import sys
+```
+
+然后，定义参数：
+```python
+EMBED_SIZE = 32  # word vector dimension
+HIDDEN_SIZE = 256  # hidden layer dimension
+N = 5  # train 5-gram
+BATCH_SIZE = 32  # batch size
+
+# can use CPU or GPU
+use_cuda = os.getenv('WITH_GPU', '0') != '0'
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+```
+
+不同于之前的PaddlePaddle v2版本，在新的Fluid版本里，我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`，我们就可以直接用它来构造 N-gram 神经网络。
+
+- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏，我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。
+
+```python
+def inference_program(is_sparse):
+first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
+
+embed_first = fluid.layers.embedding(
+input=first_word,
+size=[dict_size, EMBED_SIZE],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr='shared_w')
+embed_second = fluid.layers.embedding(
+input=second_word,
+size=[dict_size, EMBED_SIZE],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr='shared_w')
+embed_third = fluid.layers.embedding(
+input=third_word,
+size=[dict_size, EMBED_SIZE],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr='shared_w')
+embed_fourth = fluid.layers.embedding(
+input=fourth_word,
+size=[dict_size, EMBED_SIZE],
+dtype='float32',
+is_sparse=is_sparse,
+param_attr='shared_w')
+
+concat_embed = fluid.layers.concat(
+input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
+hidden1 = fluid.layers.fc(input=concat_embed,
+size=HIDDEN_SIZE,
+act='sigmoid')
+predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+return predict_word
+```
+
+- 基于以上的神经网络结构，我们可以如下定义我们的`训练`方法
+
+```python
+def train_program(is_sparse):
+# The declaration of 'next_word' must be after the invoking of inference_program,
+# or the data input order of train program would be [next_word, firstw, secondw,
+# thirdw, fourthw], which is not correct.
+predict_word = inference_program(is_sparse)
+next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+avg_cost = fluid.layers.mean(cost)
+return avg_cost
+```
+
+- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集：`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中，读取器是一个Python的函数，每次调用，会读取下一条数据。它是一个Python的generator。
+
+`paddle.batch` 会读入一个读取器，然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤，批次的训练情况。
+
+```python
+def optimizer_func():
+# Note here we need to choose more sophisticated optimizers
+# such as AdaGrad with a decay rate. The normal SGD converges
+# very slowly.
+# optimizer=fluid.optimizer.SGD(learning_rate=0.001),
+return fluid.optimizer.AdagradOptimizer(
+learning_rate=3e-3,
+regularization=fluid.regularizer.L2DecayRegularizer(8e-4))
+
+
+def train(use_cuda, train_program, params_dirname):
+train_reader = paddle.batch(
+paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+test_reader = paddle.batch(
+paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
+
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+# We output cost every 10 steps.
+if event.step % 10 == 0:
+outs = trainer.test(
+reader=test_reader,
+feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
+avg_cost = outs[0]
+
+print "Step %d: Average Cost %f" % (event.step, avg_cost)
+
+# If average cost is lower than 5.8, we consider the model good enough to stop.
+# Note 5.8 is a relatively high value. In order to get a better model, one should
+# aim for avg_cost lower than 3.5. But the training could take longer time.
+if avg_cost < 5.8:
+trainer.save_params(params_dirname)
+trainer.stop()
+
+if math.isnan(avg_cost):
+sys.exit("got NaN loss, training failed.")
+
+trainer = fluid.Trainer(
+train_func=train_program,
+optimizer_func=optimizer_func,
+place=place)
+
+trainer.train(
+reader=train_reader,
+num_epochs=1,
+event_handler=event_handler,
+feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
+```
+
+- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下：
+
+```python
+Step 0: Average Cost 7.337213
+Step 10: Average Cost 6.136128
+Step 20: Average Cost 5.766995
+...
+```
+
+## 模型应用
+在模型训练后，我们可以用它做一些预测。
+
+### 预测下一个词
+我们可以用我们训练过的模型，在得知之前的 N-gram 后，预测下一个词。
+
+```python
+def infer(use_cuda, inference_program, params_dirname=None):
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+inferencer = fluid.Inferencer(
+infer_func=inference_program, param_path=params_dirname, place=place)
+
+# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
+# is simply an index to look up for the corresponding word vector and hence
+# the shape of word (base_shape) should be [1]. The length-based level of
+# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
+# one lod_level and there is only one sequence of one word on this level.
+# Note that lod info should be a list of lists.
+
+data1 = [[211]]  # 'among'
+data2 = [[6]]    # 'a'
+data3 = [[96]]   # 'group'
+data4 = [[4]]    # 'of'
+lod = [[1]]
+
+first_word  = fluid.create_lod_tensor(data1, lod, place)
+second_word = fluid.create_lod_tensor(data2, lod, place)
+third_word  = fluid.create_lod_tensor(data3, lod, place)
+fourth_word = fluid.create_lod_tensor(data4, lod, place)
+
+result = inferencer.infer(
+{
+'firstw': first_word,
+'secondw': second_word,
+'thirdw': third_word,
+'fourthw': fourth_word
+},
+return_numpy=False)
+
+print(numpy.array(result[0]))
+most_possible_word_index = numpy.argmax(result[0])
+print(most_possible_word_index)
+print([
+key for key, value in word_dict.iteritems()
+if value == most_possible_word_index
+][0])
+```
+
+在经历3分钟的短暂训练后，我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长，比如几个小时，那么我们会得到的下一个预测是 `workers`。
+
+
+```python
+[[0.00106646 0.0007907  0.00072041 ... 0.00049024 0.00041355 0.00084464]]
+6
+a
+```
+
+整个程序的入口很简单：
+
+```python
+def main(use_cuda, is_sparse):
+if use_cuda and not fluid.core.is_compiled_with_cuda():
+return
+
+params_dirname = "word2vec.inference.model"
+
+train(
+use_cuda=use_cuda,
+train_program=partial(train_program, is_sparse),
+params_dirname=params_dirname)
+
+infer(
+use_cuda=use_cuda,
+inference_program=partial(inference_program, is_sparse),
+params_dirname=params_dirname)
+
+
+main(use_cuda=use_cuda, is_sparse=True)
+```
+
+
+## 总结
+本章中，我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中，我们可以根据向量间的余弦夹角，来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中，训练好的词向量可以用来初始化模型，以得到更好的效果。在文档分类中，有了词向量之后，可以用聚类的方法将文档中同义词进行分组，也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。
+
+
+## 参考文献
+1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155.
+2. Mikolov T, Kombrink S, Deoras A, et al. [Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201.
+3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013.
+4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605.
+5. https://en.wikipedia.org/wiki/Singular_value_decomposition
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/index.rst b/doc/fluid/new_docs/beginners_guide/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e18933dcc0038129077a455892ddd785579f0003
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/index.rst
@@ -0,0 +1,15 @@
+########
+新手入门
+########
+
+..  todo::
+
+    新手入门的导引文字，需要完善。
+
+..  toctree::
+    :maxdepth: 2
+    
+    install/install_doc.rst
+    quick_start/index.rst
+    basics/index.rst
+    basics/learning_materials.md
diff --git a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8a66a95f45ea18dbfdc2450694517d5df8c47efd
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst
@@ -0,0 +1,543 @@
+.. _how_to_install:
+
+安装说明
+^^^^^^^^
+
+若您的系统为Linux或Windows，您可以使用我们提供的安装包来安装PaddlePaddle。
+
+对于MacOS系统，我们暂未提供安装包，您可以使用 **从源码编译** 的方式安装。
+
+
+.. _install_linux:
+
+在Linux安装PaddlePaddle
+--------
+
+推荐您使用 `pip <https://pypi.org/project/pip/>`_
+安装，它是Linux系统下最简单的安装方式。
+
+注意事项：
+
+- PaddlePaddle Python API 依赖Python 2.7版本。
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+您可以通过指定版本号来安装其它版本，例如：
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.13.0
+
+
+如果需要安装支持GPU的版本（cuda9.0_cudnn7_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.14.0            使用CUDA 9.0和cuDNN 7编译的0.14.0版本
+paddlepaddle-gpu==0.14.0.post87     使用CUDA 8.0和cuDNN 7编译的0.14.0版本
+paddlepaddle-gpu==0.14.0.post85     使用CUDA 8.0和cuDNN 5编译的0.14.0版本
+paddlepaddle-gpu==0.13.0            使用CUDA 9.0和cuDNN 7编译的0.13.0版本
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_
+中找到paddlepaddle-gpu的各个发行版本。
+
+如果需要获取并安装最新的PaddlePaddle开发分支，可以从我们的 `CI系统 <https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview>`_ 中下载最新的whl安装包和c-api开发包并安装。如需登录，请点击“Log in as guest”。
+
+.. _FAQ:
+
+安装常见问题和解决方法
+======================
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。
+请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，
+需要使用最新的pip (>9.0.0) 才可以安装。
+
+可以使用下面的命令更新您的pip：
+
+  .. code-block:: bash
+
+      pip install --upgrade pip
+
+如果仍然存在问题，可以执行：
+
+    .. code-block:: bash
+
+        python -c "import pip; print(pip.pep425tags.get_supported())"
+
+获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包
+可以在 `这里 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 找到。
+
+如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新；
+如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64，
+可以重命名这个whl包为 manylinux1_x86_64 再安装。
+
+
+.. _install_windows:
+
+在Windows安装PaddlePaddle
+------------------------------
+Windows系统需要通过Docker来使用PaddleaPaddle。Docker是一个虚拟容器，使用Docker可以简化复杂的环境配置工作。
+
+我们提供了 `PaddlePaddle_Windows快速安装包 <http://paddle-windows.bj.bcebos.com/PaddlePaddle-windows.zip>`_，
+它能够帮助您安装Docker和PaddlePaddle。
+
+* 安装包支持的系统：Windows7，Windows8的所有版本，Windows10的专业版、企业版。
+
+* 如果您希望使用GPU提升训练速度，请使用Linux系统安装，Windows系统暂不支持。
+   
+.. _install_mac:
+
+在MacOS安装PaddlePaddle
+--------
+
+对于MacOS系统，我们暂未提供pip安装方式，您可以使用 **源码编译** 的方式安装。
+
+.. _others:
+
+其他安装方式
+-------------
+
+.. _source:
+源码编译（使用Docker镜像）
+==========
+
+.. _requirements:
+
+需要的软硬件
+"""""""""""""
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
+.. _build_step:
+
+编译方法
+"""""""""""""
+
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+
+**I. 编译CPU-Only版本的PaddlePaddle，需要执行：**
+
+.. code-block:: bash
+
+   # 1. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. 执行如下命令下载最新版本的docker镜像
+   docker run --name paddle-test -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash
+   # 3. 进入docker内执行如下命令编译CPU-Only的二进制安装包
+   mkdir -p /paddle/build && cd /paddle/build
+   cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF
+   make -j$(nproc)
+
+**II. 编译GPU版本的PaddlePaddle，需要执行：**
+
+.. code-block:: bash
+
+  # 1. 获取源码 
+  git clone https://github.com/PaddlePaddle/Paddle.git 
+  cd Paddle
+  # 2. 安装nvidia-docker
+  apt-get install nvidia-docker
+  # 3. 执行如下命令下载支持GPU运行的docker容器
+  nvidia-docker run --name paddle-test-gpu -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash
+  # 4. 进入docker内执行如下命令编译GPU版本的PaddlePaddle
+  mkdir -p /paddle/build && cd /paddle/build
+  cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF
+  make -j$(nproc)
+
+**注意事项：**
+
+* 上述有关 :code:`docker` 的命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+* 进入 :code:`docker` 后执行 :code:`cmake` 命令，若是出现 :code:`patchelf not found, please install it.` 错误，则执行 :code:`apt-get install -y patchelf` 命令即可解决问题。
+* 若您在使用Docker编译PaddlePaddle遇到问题时， `这个issue <https://github.com/PaddlePaddle/Paddle/issues/12079>`_ 可能会对您有所帮助。
+
+
+.. _source:
+源码编译（不使用Docker镜像）
+==========
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `附录：编译依赖`_ 之后才能开始编译的步骤。
+
+.. _build_step:
+
+编译方法
+"""""""""""""
+
+在本机上编译CPU-Only版本的PaddlePaddle，需要执行如下命令：
+
+.. code-block:: bash
+
+   # 1. 使用virtualenvwrapper创建python虚环境并将工作空间切换到虚环境 [可选]
+   mkvirtualenv paddle-venv
+   workon paddle-venv
+   # 2. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 3. 执行下面的命令编译CPU-Only的二进制
+   mkdir build && cd build
+   cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF
+   make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译
+
+
+**注意事项：**
+
+* MacOS系统下因为默认安装了cblas库，所以编译时可能会遇到 :code:`use of undeclared identifier 'openblas_set_num_threads'` 错误。因此，在执行cmake命令时需要指定所使用openblas库的头文件路径，具体操作如下：
+
+  .. code-block:: bash
+
+    cd Paddle/build && rm -rf *
+    cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DOPENBLAS_INC_DIR=/usr/local/Cellar/openblas/[本机所安装的openblas版本号]/include/
+    make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译
+* 若您在MacOS系统下从源码编译PaddlePaddle遇到问题时， `这个issue <https://github.com/PaddlePaddle/Paddle/issues/12078>`_ 可能会对您有所帮助。
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle，有两种方法：
+
+.. code-block:: bash
+
+   1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+"""""""""""""
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build
+
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+"""""""""""""
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_。
+  这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  .. code-block:: bash
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command
+     "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Bash 脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/paddle/scripts/paddle_build.sh>`_。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_。
+
+- 磁盘不够
+
+  本文中的例子里， :code:`docker run` 命令里都用了 :code:`--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 :code:`docker ps -a` 命令看到停止后但是没有删除的 containers。 :code:`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+
+
+.. _compile_deps:
+
+附录：编译依赖
+"""""""""""""
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", "3.4", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "SWIG", ">=2.0", ""
+   "wget","",""
+   "openblas","",""
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "protobuf","3.1.0",""
+   "wheel","",""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+附录：编译选项
+"""""""""""""
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（ :code:`rm -rf` ）后，再指定。
+
+.. _install_docker:
+
+使用Docker安装运行
+==================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_
+获得基本的Docker安装和使用方法。
+
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+""""""""""""""""""""""""""""
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像：
+
+  .. code-block:: bash
+
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+"""""""""""""""""""""""""""""""""""
+
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
+编写），就可以使用下面的命令开始执行训练：
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+上述命令中， :code:`-it` 参数说明容器已交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
+
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+""""""""""""""""""""""""""""""""""""
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址：
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+就这么简单，享受您的旅程！
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+""""""""""""""""""""""""""""
+
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX：**
+
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX：
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/fluid/new_docs/beginners_guide/install/paddleci.png b/doc/fluid/new_docs/beginners_guide/install/paddleci.png
new file mode 100644
index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/install/paddleci.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba43ada5100ed1db7192de9c795b4b8a6596d705
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -0,0 +1,329 @@
+```eval_rst
+..  _quick_start_fit_a_line:
+```
+# 线性回归
+让我们从经典的线性回归（Linear Regression \[[1](#参考文献)\]）模型开始这份教程。在这一章里，你将使用真实的数据集建立起一个房价预测模型，并且了解到机器学习中的若干重要概念。
+
+本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+给定一个大小为`$n$`的数据集  `${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$`，其中`$x_{i1}, \ldots, x_{id}$`是第`$i$`个样本`$d$`个属性上的取值，`$y_i$`是该样本待预测的目标。线性回归模型假设目标`$y_i$`可以被属性间的线性组合描述，即
+
+$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b,  i=1,\ldots,n$$
+
+例如，在我们将要建模的房价预测问题里，`$x_{ij}$`是描述房子`$i$`的各种属性（比如房间的个数、周围学校和医院的个数、交通状况等），而 `$y_i$`是房屋的价格。
+
+初看起来，这个假设实在过于简单了，变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点，它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。
+
+## 效果展示
+我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中，每个点的横坐标表示同一类房屋真实价格的中位数，纵坐标表示线性回归模型根据特征预测的结果，当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确，则点离虚线越近。
+
+![BostonHousePricePredictions](./image/predictions.png)
+<p align="center">图1. 预测值 V.S. 真实值</p>
+
+## 模型概览
+
+### 模型定义
+
+在波士顿房价数据集中，和房屋相关的值共有14个：前13个用来描述房屋相关的各种信息，即模型中的 `$x_i$`；最后一个值为我们要预测的该类房屋价格的中位数，即模型中的 `$y_i$`。因此，我们的模型就可以表示成：
+
+$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
+
+`$\hat{Y}$` 表示模型的预测结果，用来和真实值`$Y$`区分。模型要学习的参数即：`$\omega_1, \ldots, \omega_{13}, b$`。
+
+建立模型后，我们需要给模型一个优化目标，使得学到的参数能够让预测值`$\hat{Y}$`尽可能地接近真实值`$Y$`。这里我们引入损失函数（[Loss Function](https://en.wikipedia.org/wiki/Loss_function)，或Cost Function）这个概念。 输入任意一个数据样本的目标值`$y_{i}$`和模型给出的预测值`$\hat{y_{i}}$`，损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。
+
+对于线性回归模型来讲，最常见的损失函数就是均方误差（Mean Squared Error， [MSE](https://en.wikipedia.org/wiki/Mean_squared_error)）了，它的形式是：
+
+$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
+
+即对于一个大小为`$n$`的测试集，`$MSE$`是`$n$`个数据预测结果误差平方的均值。
+
+### 训练过程
+
+定义好模型结构之后，我们要通过以下几个步骤进行模型训练
+1. 初始化参数，其中包括权重`$\omega_i$`和偏置`$b$`，对其进行初始化（如0均值，1方差）。
+2. 网络正向传播计算网络输出和损失函数。
+3. 根据损失函数进行反向误差传播 （[backpropagation](https://en.wikipedia.org/wiki/Backpropagation)），将网络误差从输出层依次向前传递, 并更新网络中的参数。
+4. 重复2~3步骤，直至网络训练误差达到规定的程度或训练轮次达到设定值。
+
+## 数据集
+
+### 数据集介绍
+这份数据集共506行，每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下：
+
+<p align="center">
+<table>
+    <thead>
+    <tr>
+        <th>属性名</th>
+        <th>解释</th>
+        <th>类型</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+        <td>CRIM</td>
+        <td>该镇的人均犯罪率</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>ZN</td>
+        <td>占地面积超过25,000平方呎的住宅用地比例</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>INDUS</td>
+        <td>非零售商业用地比例</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>CHAS</td>
+        <td>是否邻近 Charles River</td>
+        <td>离散值，1=邻近；0=不邻近</td>
+    </tr>
+    <tr>
+        <td>NOX</td>
+        <td>一氧化氮浓度</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>RM</td>
+        <td>每栋房屋的平均客房数</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>AGE</td>
+        <td>1940年之前建成的自用单位比例</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>DIS</td>
+        <td>到波士顿5个就业中心的加权距离</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>RAD</td>
+        <td>到径向公路的可达性指数</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>TAX</td>
+        <td>全值财产税率</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>PTRATIO</td>
+        <td>学生与教师的比例</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>B</td>
+        <td>1000(BK - 0.63)^2，其中BK为黑人占比</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>LSTAT</td>
+        <td>低收入人群占比</td>
+        <td>连续值</td>
+    </tr>
+    <tr>
+        <td>MEDV</td>
+        <td>同类房屋价格的中位数</td>
+        <td>连续值</td>
+    </tr>
+    </tbody>
+</table>
+</p>
+
+### 数据预处理
+#### 连续值与离散值
+观察一下数据，我们的第一个发现是：所有的13维属性中，有12维的连续值和1维的离散值（CHAS）。离散值虽然也常使用类似0、1、2这样的数字表示，但是其含义与连续值是不同的，因为这里的差值没有实际意义。例如，我们用0、1、2来分别表示红色、绿色和蓝色的话，我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有`$d$`个可能取值的离散属性，我们会将它们转为`$d$`个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言，因为CHAS本身就是一个二值属性，就省去了这个麻烦。
+
+#### 属性的归一化
+另外一个稍加观察即可发现的事实是，各维属性的取值范围差别很大（如图2所示）。例如，属性B的取值范围是[0.32, 396.90]，而属性NOX的取值范围是[0.3850, 0.8170]。这里就要用到一个常见的操作-归一化（normalization）了。归一化的目标是把各位属性的取值范围放缩到差不多的区间，例如[-0.5,0.5]。这里我们使用一种很常见的操作方法：减掉均值，然后除以原取值范围。
+
+做归一化（或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling)）至少有以下3个理由：
+- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。
+- 不同的数值范围会导致不同属性对模型的重要性不同（至少在训练的初始阶段如此），而这个隐含的假设常常是不合理的。这会对优化的过程造成困难，使训练时间大大的加长。
+- 很多的机器学习技巧/模型（例如L1，L2正则项，向量空间模型-Vector Space Model）都基于这样的假设：所有的属性取值都差不多是以0为均值且取值范围相近的。
+
+![featureScale](./image/ranges.png)
+<p align="center">图2. 各维属性的取值范围</p>
+
+#### 整理训练集与测试集
+我们将数据集分割为两份：一份用于调整模型的参数，即进行模型的训练，模型在这份数据集上的误差被称为**训练误差**；另外一份被用来测试，模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据，所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素：更多的训练数据会降低参数估计的方差，从而得到更可信的模型；而更多的测试数据会降低测试误差的方差，从而得到更可信的测试误差。我们这个例子中设置的分割比例为`$8:2$`
+
+
+在更复杂的模型训练过程中，我们往往还会多使用一种数据集：验证集。因为复杂的模型中常常还有一些超参数（[Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization)）需要调节，所以我们会尝试多种超参数的组合来分别训练多个模型，然后对比它们在验证集上的表现选择相对最好的一组超参数，最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单，我们暂且忽略掉这个过程。
+
+## 训练
+
+`fit_a_line/trainer.py`演示了训练的整体过程。
+
+### 配置数据提供器(Datafeeder)
+首先我们引入必要的库：
+```python
+import paddle
+import paddle.fluid as fluid
+import numpy
+```
+
+我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)
+
+其中，在uci_housing模块中封装了：
+
+1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。
+2. [数据预处理](#数据预处理)的过程。
+
+接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性，她可以同时定义一个批次大小和一个缓存大小。这样的话，每次数据提供器会从缓存中随机读取批次大小那么多的数据。
+
+```python
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.uci_housing.train(), buf_size=500),
+batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.uci_housing.test(), buf_size=500),
+batch_size=BATCH_SIZE)
+```
+
+### 配置训练程序
+训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲，它就是一个从输入到输出的简单的全连接层。更加复杂的结果，比如卷积神经网络，递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值，因为它会被后面反向传播算法所用到。
+
+```python
+def train_program():
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+# feature vector of length 13
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+loss = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_loss = fluid.layers.mean(loss)
+
+return avg_loss
+```
+
+### 定义运算场所
+我们可以定义运算是发生在CPU还是GPU
+
+```python
+use_cuda = False
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+```
+
+### 创建训练器
+训练器会读入一个训练程序和一些必要的其他参数：
+
+```python
+trainer = fluid.Trainer(
+train_func=train_program,
+place=place,
+optimizer_func=fluid.optimizer.SGD(learning_rate=0.001))
+```
+
+### 开始提供数据
+PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据，因此我们需要一个Python的list来定义读取顺序。
+
+```python
+feed_order=['x', 'y']
+```
+
+除此之外，可以定义一个事件相应器来处理类似`打印训练进程`的事件：
+
+```python
+# Specify the directory path to save the parameters
+params_dirname = "fit_a_line.inference.model"
+
+# Plot data
+from paddle.v2.plot import Ploter
+train_title = "Train cost"
+test_title = "Test cost"
+plot_cost = Ploter(train_title, test_title)
+
+step = 0
+
+# event_handler to print training and testing info
+def event_handler_plot(event):
+global step
+if isinstance(event, fluid.EndStepEvent):
+if event.step % 10 == 0: # every 10 batches, record a test cost
+test_metrics = trainer.test(
+reader=test_reader, feed_order=feed_order)
+
+plot_cost.append(test_title, step, test_metrics[0])
+plot_cost.plot()
+
+if test_metrics[0] < 10.0:
+# If the accuracy is good enough, we can stop the training.
+print('loss is less than 10.0, stop')
+trainer.stop()
+
+# We can save the trained parameters for the inferences later
+if params_dirname is not None:
+trainer.save_params(params_dirname)
+
+step += 1
+```
+
+### 开始训练
+我们现在可以通过调用`trainer.train()`来开始训练
+
+```python
+%matplotlib inline
+
+# The training could take up to a few minutes.
+trainer.train(
+reader=train_reader,
+num_epochs=100,
+event_handler=event_handler_plot,
+feed_order=feed_order)
+```
+
+![trainTestCost](./image/train_and_test.png)
+
+## 预测
+提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。
+
+### 设定预测程序
+类似于`trainer.train`，预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。
+
+
+```python
+def inference_program():
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+return y_predict
+```
+
+### 预测
+预测器会从`params_dirname`中读取已经训练好的模型，来对从未遇见过的数据进行预测。
+
+```python
+inferencer = fluid.Inferencer(
+infer_func=inference_program, param_path=params_dirname, place=place)
+
+batch_size = 10
+tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+
+results = inferencer.infer({'x': tensor_x})
+print("infer results: ", results[0])
+```
+
+## 总结
+在这章里，我们借助波士顿房价这一数据集，介绍了线性回归模型的基本概念，以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来，因此弄清楚线性模型的原理和局限非常重要。
+
+
+## 参考文献
+1. https://en.wikipedia.org/wiki/Linear_regression
+2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001.
+3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012.
+4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
new file mode 100644
index 0000000000000000000000000000000000000000..27e4acb1313794f52ad9ad9e874cdadd197ff41f
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d86b12715f46afbafb7d50e2938e184219b5b95
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..bcd304a6a0baf30ecfbc43e08fc0aca179d05958
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/index.rst b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f5889ba52b8016596108de48bad59f238c16afc0
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst
@@ -0,0 +1,13 @@
+########
+快速入门
+########
+
+..  todo::
+
+    概述
+
+..  toctree::
+    :maxdepth: 2
+
+    fit_a_line/README.cn.md
+    recognize_digits/README.cn.md
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c04a949a3f6550048f2a3447070829aeb640b995
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -0,0 +1,453 @@
+# 识别数字
+
+本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
+
+## 背景介绍
+当我们学习编程的时候，编写的第一个程序一般是实现打印"Hello World"。而机器学习（或深度学习）的入门教程，一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题，比较简单，同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集，包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵，标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
+
+![MNIST](./image/mnist_example_image.png)
+<p align="center">图1. MNIST图片示例</p>
+
+MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3（SD-3）和Special Database 1（SD-1）构建而来。由于SD-3是由美国人口调查局的员工进行标注，SD-1是由美国高中生进行标注，因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集（60000条数据）和测试集（10000条数据），其中训练集来自250位不同的标注员，此外还保证了训练集和测试集的标注员是不完全相同的。
+
+Yann LeCun早先在手写字符识别上做了很多研究，并在研究过程中提出了卷积神经网络（Convolutional Neural Network），大幅度地提高了手写字符的识别能力，也因此成为了深度学习领域的奠基人之一。如今的深度学习领域，卷积神经网络占据了至关重要的地位，从最早Yann LeCun提出的简单LeNet，到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等（请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程），人们在图像分类领域，利用卷积神经网络得到了一系列惊人的结果。
+
+有很多算法在MNIST上进行实验。1998年，LeCun分别用单层线性分类器、多层感知器（Multilayer Perceptron, MLP）和多层卷积神经网络LeNet进行实验，使得测试集上的误差不断下降（从12%下降到0.7%）\[[1](#参考文献)\]。此后，科学家们又基于K近邻（K-Nearest Neighbors）算法\[[2](#参考文献)\]、支持向量机（SVM）\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验，并采用多种预处理方法（如去除歪曲、去噪、模糊等）来提高识别的准确率。
+
+本教程中，我们从简单的模型Softmax回归开始，带大家入门手写字符识别，并逐步进行模型优化。
+
+
+## 模型概览
+
+基于MNIST数据训练一个分类器，在介绍本教程使用的三个基本图像分类网络前，我们先给出一些定义：
+- `$X$`是输入：MNIST图片是`$28\times28$` 的二维图像，为了进行计算，我们将其转化为`$784$`维向量，即`$X=\left ( x_0, x_1, \dots, x_{783} \right )$`。
+- `$Y$`是输出：分类器的输出是10类数字（0-9），即`$Y=\left ( y_0, y_1, \dots, y_9 \right )$`，每一维`$y_i$`代表图片分类为第`$i$`类数字的概率。
+- `$L$`是图片的真实标签：`$L=\left ( l_0, l_1, \dots, l_9 \right )$`也是10维，但只有一维为1，其他都为0。
+
+### Softmax回归(Softmax Regression)
+
+最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征，然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。
+
+输入层的数据`$X$`传到输出层，在激活操作之前，会乘以相应的权重 `$W$` ，并加上偏置变量 `$b$` ，具体如下：
+
+$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
+
+其中 `$ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $`
+
+对于有 `$N$` 个类别的多分类问题，指定 `$N$` 个输出节点，`$N$` 维结果向量经过softmax将归一化为 `$N$` 个[0,1]范围内的实数值，分别表示该样本属于这 `$N$` 个类别的概率。此处的 `$y_i$` 即对应该图片为数字 `$i$` 的预测概率。
+
+在分类问题中，我们一般采用交叉熵代价损失函数（cross entropy），公式如下：
+
+$$  \text{crossentropy}(label, y) = -\sum_i label_ilog(y_i) $$
+
+图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
+
+![softmaxRegression](./image/softmax_regression.png)
+<p align="center">图2. softmax回归网络结构图</p>
+
+### 多层感知器(Multilayer Perceptron, MLP)
+
+Softmax回归模型采用了最简单的两层神经网络，即只有输入层和输出层，因此其拟合能力有限。为了达到更好的识别效果，我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。
+
+1.  经过第一个隐藏层，可以得到 `$ H_1 = \phi(W_1X + b_1) $`，其中`$\phi$`代表激活函数，常见的有sigmoid、tanh或ReLU等函数。
+2.  经过第二个隐藏层，可以得到 `$ H_2 = \phi(W_2H_1 + b_2) $`。
+3.  最后，再经过输出层，得到的`$Y=\text{softmax}(W_3H_2 + b_3)$`，即为最后的分类结果向量。
+
+
+图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
+
+![multilayerPerceptron](./image/mlp.png)
+<p align="center">图3. 多层感知器网络结构图</p>
+
+### 卷积神经网络(Convolutional Neural Network, CNN)
+
+在多层感知器模型中，将图像展开成一维向量输入到网络中，忽略了图像的位置和结构信息，而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
+
+![cnnStructure](./image/cnn.png)
+<p align="center">图4. LeNet-5卷积神经网络结构</p>
+
+#### 卷积层
+
+卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积，即离散二维滤波器（也称作卷积核）与二维图像做卷积操作，简单的讲是二维滤波器滑动到二维图像上所有位置，并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域，不同卷积核可以提取不同的特征，例如边沿、线性、角等特征。在深层卷积神经网络中，通过卷积操作可以提取出图像低级到复杂的特征。
+
+![cnn](https://raw.githubusercontent.com/PaddlePaddle/book/develop/02.recognize_digits/image/conv_layer.png)
+<p align="center">图5. 卷积层图片</p>
+
+图5给出一个卷积计算过程的示例图，输入图像大小为`$H=5,W=5,D=3$`，即`$5 \times 5$`大小的3通道（RGB，也称作深度）彩色图像。这个示例图中包含两（用`$K$`表示）组卷积核，即图中滤波器`$W_0$`和`$W_1$`。在卷积计算中，通常对不同的输入通道采用不同的卷积核，如图示例中每组卷积核包含（`$D=3$`）个`$3 \times 3$`（用`$F \times F$`表示）大小的卷积核。另外，这个示例中卷积核在图像的水平方向（`$W$`方向）和垂直方向（`$H$`方向）的滑动步长为2（用`$S$`表示）；对输入图像周围各填充1（用`$P$`表示）个0，即图中输入层原始数据为蓝色部分，灰色部分是进行了大小为1的扩展，用0来进行扩展。经过卷积操作得到输出为`$3 \times 3 \times 2$`（用`$H_{o} \times W_{o} \times K$`表示）大小的特征图，即`$3 \times 3$`大小的2通道特征图，其中`$H_o$`计算公式为：`$H_o = (H - F + 2 \times P)/S + 1$`，`$W_o$`同理。 而输出特征图中的每个像素，是每组滤波器与输入图像每个特征图的内积再求和，再加上偏置`$b_o$`，偏置通常对于每个输出特征图是共享的。输出特征图`$o[:,:,0]$`中的最后一个`$-2$`计算如图5右下角公式所示。
+
+在卷积操作中卷积核是可学习的参数，经过上面示例介绍，每层卷积的参数大小为`$D \times F \times F \times K$`。在多层感知器模型中，神经元通常是全部连接，参数较多。而卷积层的参数较少，这也是由卷积层的主要特性即局部连接和共享权重所决定。
+
+- 局部连接：每个神经元仅与输入神经元的一块区域连接，这块局部区域称作感受野（receptive field）。在图像卷积操作中，即神经元在空间维度（spatial dimension，即上图示例H和W所在的平面）是局部连接，但在深度上是全部连接。对于二维图像本身而言，也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想，也是受启发于生物学里面的视觉系统结构，视觉皮层的神经元就是局部接受信息的。
+
+- 权重共享：计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算`$o[:,:,0]$`的每个每个神经元的滤波器均相同，都为`$W_0$`，这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的，例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的，比如输入的图片是人脸，眼睛和头发位于不同的位置，希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的，在卷积层，通常采用多组卷积核提取不同特征，即对应不同深度切片的特征，不同深度切片的神经元权重是不共享。另外，偏重对同一深度切片的所有神经元都是共享的。
+
+通过介绍卷积计算过程及其特性，可以看出卷积是线性操作，并具有平移不变性（shift-invariant），平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小，这样也有利于训练较大卷积神经网络。
+
+#### 池化层
+
+![pooling](./image/max_pooling.png)
+<p align="center">图6. 池化层图片</p>
+
+池化是非线性下采样的一种形式，主要作用是通过减少网络的参数来减小计算量，并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域，对于每个矩形框的数取最大值作为输出层，如图6所示。
+
+更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
+
+### 常见激活函数介绍
+- sigmoid激活函数： `$ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $`
+
+- tanh激活函数： `$ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $`
+
+实际上，tanh函数只是规模变化的sigmoid函数，将sigmoid函数值放大2倍之后再向下平移1个单位：tanh(x) = 2sigmoid(2x) - 1 。
+
+- ReLU激活函数： `$ f(x) = max(0, x) $`
+
+更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。
+
+## 数据介绍
+
+PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下：
+
+<p align="center">
+<table>
+    <thead>
+    <tr>
+        <th>文件名称</th>
+        <th>说明</th>
+    </tr>
+    </thead>
+
+    <tbody>
+    <tr>
+        <td>train-images-idx3-ubyte</td>
+        <td>训练数据图片，60,000条数据</td>
+    </tr>
+    <tr>
+        <td>train-labels-idx1-ubyte</td>
+        <td>训练数据标签，60,000条数据</td>
+    </tr>
+    <tr>
+        <td>t10k-images-idx3-ubyte</td>
+        <td>测试数据图片，10,000条数据</td>
+    </tr>
+    <tr>
+        <td>t10k-labels-idx1-ubyte</td>
+        <td>测试数据标签，10,000条数据</td>
+    </tr>
+    </tbody>
+</table>
+</p>
+
+## Fluid API 概述
+
+演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。
+我们建议使用 Fluid API，因为它更容易学起来。
+
+下面是快速的 Fluid API 概述。
+1. `inference_program`：指定如何从数据输入中获得预测的函数。
+这是指定网络流的地方。
+
+1. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
+这是指定损失计算的地方。
+
+1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
+
+1. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
+通过 `event_handler` 回调函数，用户可以监控培训的进展。
+
+1. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
+然后，它可以推断数据和返回预测。
+
+在这个演示中，我们将深入了解它们。
+
+## 配置说明
+加载 PaddlePaddle 的 Fluid API 包。
+
+```python
+import paddle
+import paddle.fluid as fluid
+```
+
+### Program Functions 配置
+
+我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器，每个分类器都定义为 Python 函数。
+我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。
+让我们创建一个数据层来读取图像并将其连接到分类网络。
+
+- Softmax回归：只通过一层简单的以softmax为激活函数的全连接层，就可以得到分类的结果。
+
+```python
+def softmax_regression():
+img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+predict = fluid.layers.fc(
+input=img, size=10, act='softmax')
+return predict
+```
+
+- 多层感知器：下面代码实现了一个含有两个隐藏层（即全连接层）的多层感知器。其中两个隐藏层的激活函数均采用ReLU，输出层的激活函数用Softmax。
+
+```python
+def multilayer_perceptron():
+img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+# 第一个全连接层，激活函数为ReLU
+hidden = fluid.layers.fc(input=img, size=200, act='relu')
+# 第二个全连接层，激活函数为ReLU
+hidden = fluid.layers.fc(input=hidden, size=200, act='relu')
+# 以softmax为激活函数的全连接输出层，输出层的大小必须为数字的个数10
+prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+return prediction
+```
+
+- 卷积神经网络LeNet-5: 输入的二维图像，首先经过两次卷积层到池化层，再经过全连接层，最后使用以softmax为激活函数的全连接层作为输出层。
+
+```python
+def convolutional_neural_network():
+img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+# 第一个卷积-池化层
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
+input=img,
+filter_size=5,
+num_filters=20,
+pool_size=2,
+pool_stride=2,
+act="relu")
+conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+# 第二个卷积-池化层
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
+input=conv_pool_1,
+filter_size=5,
+num_filters=50,
+pool_size=2,
+pool_stride=2,
+act="relu")
+# 以softmax为激活函数的全连接输出层，输出层的大小必须为数字的个数10
+prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+return prediction
+```
+
+#### Train Program 配置
+然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。
+在训练期间，它将从预测中计算 `avg_cost`。
+
+**注意:** 训练程序应该返回一个数组，第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。
+
+请随意修改代码，测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。
+
+```python
+def train_program():
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+# predict = softmax_regression() # uncomment for Softmax回归
+# predict = multilayer_perceptron() # uncomment for 多层感知器
+predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(cost)
+acc = fluid.layers.accuracy(input=predict, label=label)
+return [avg_cost, acc]
+
+
+# 该模型运行在单个CPU上
+```
+
+#### Optimizer Function 配置
+
+在下面的 `Adam optimizer`，`learning_rate` 是训练的速度，与网络的训练收敛速度有关系。
+
+```python
+def optimizer_program():
+return fluid.optimizer.Adam(learning_rate=0.001)
+```
+
+### 数据集 Feeders 配置
+
+下一步，我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python yield generator。
+
+下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B —— reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
+
+`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader —— 在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
+
+```python
+train_reader = paddle.batch(
+paddle.reader.shuffle(
+paddle.dataset.mnist.train(), buf_size=500),
+batch_size=64)
+
+test_reader = paddle.batch(
+paddle.dataset.mnist.test(), batch_size=64)
+```
+
+### Trainer 配置
+
+现在，我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。
+
+```python
+# 该模型运行在单个CPU上
+use_cuda = False # set to True if training with GPU
+place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+trainer = fluid.Trainer(
+train_func=train_program, place=place, optimizer_func=optimizer_program)
+```
+
+#### Event Handler 配置
+
+Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过机制监控培训进度。
+我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 ，看看有什么不同。
+
+`event_handler` 用来在训练过程中输出训练结果
+
+```python
+# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer
+params_dirname = "recognize_digits_network.inference.model"
+lists = []
+def event_handler(event):
+if isinstance(event, fluid.EndStepEvent):
+if event.step % 100 == 0:
+# event.metrics maps with train program return arguments.
+# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example.
+print "Pass %d, Batch %d, Cost %f" % (
+event.step, event.epoch, event.metrics[0])
+
+if isinstance(event, fluid.EndEpochEvent):
+avg_cost, acc = trainer.test(
+reader=test_reader, feed_order=['img', 'label'])
+
+print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc))
+
+# save parameters
+trainer.save_params(params_dirname)
+lists.append((event.epoch, avg_cost, acc))
+```
+
+`event_handler_plot` 可以用来在训练过程中画图如下：
+
+![png](./image/train_and_test.png)
+
+```python
+from paddle.v2.plot import Ploter
+
+train_title = "Train cost"
+test_title = "Test cost"
+cost_ploter = Ploter(train_title, test_title)
+step = 0
+lists = []
+
+# event_handler to plot a figure
+def event_handler_plot(event):
+global step
+if isinstance(event, fluid.EndStepEvent):
+if step % 100 == 0:
+# event.metrics maps with train program return arguments.
+# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example.
+cost_ploter.append(train_title, step, event.metrics[0])
+cost_ploter.plot()
+step += 1
+if isinstance(event, fluid.EndEpochEvent):
+# save parameters
+trainer.save_params(params_dirname)
+
+avg_cost, acc = trainer.test(
+reader=test_reader, feed_order=['img', 'label'])
+cost_ploter.append(test_title, step, avg_cost)
+lists.append((event.epoch, avg_cost, acc))
+```
+
+#### 开始训练
+
+既然我们设置了 `event_handler` 和 `data reader`，我们就可以开始训练模型了。
+
+`feed_order` 用于将数据目录映射到 `train_program`
+
+```python
+trainer.train(
+num_epochs=5,
+event_handler=event_handler,
+reader=train_reader,
+feed_order=['img', 'label'])
+```
+
+训练过程是完全自动的，event_handler里打印的日志类似如下所示：
+
+```
+Pass 0, Batch 0, Cost 0.125650
+Pass 100, Batch 0, Cost 0.161387
+Pass 200, Batch 0, Cost 0.040036
+Pass 300, Batch 0, Cost 0.023391
+Pass 400, Batch 0, Cost 0.005856
+Pass 500, Batch 0, Cost 0.003315
+Pass 600, Batch 0, Cost 0.009977
+Pass 700, Batch 0, Cost 0.020959
+Pass 800, Batch 0, Cost 0.105560
+Pass 900, Batch 0, Cost 0.239809
+Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338
+```
+
+训练之后，检查模型的预测准确度。用 MNIST 训练的时候，一般 softmax回归模型的分类准确率为约为 92.34%，多层感知器为97.66%，卷积神经网络可以达到 99.20%。
+
+
+## 应用模型
+
+可以使用训练好的模型对手写体数字图片进行分类，下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。
+
+### Inference 配置
+
+`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。
+我们可以简单地插入在此之前定义的分类器。
+
+```python
+inferencer = fluid.Inferencer(
+# infer_func=softmax_regression, # uncomment for softmax regression
+# infer_func=multilayer_perceptron, # uncomment for MLP
+infer_func=convolutional_neural_network,  # uncomment for LeNet5
+param_path=params_dirname,
+place=place)
+```
+
+### 生成预测输入数据
+
+`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。
+
+```python
+# Prepare the test image
+import os
+import numpy as np
+from PIL import Image
+def load_image(file):
+im = Image.open(file).convert('L')
+im = im.resize((28, 28), Image.ANTIALIAS)
+im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32)
+im = im / 255.0 * 2.0 - 1.0
+return im
+
+cur_dir = cur_dir = os.getcwd()
+img = load_image(cur_dir + '/image/infer_3.png')
+```
+
+### 预测
+
+现在我们准备做预测。
+
+```python
+results = inferencer.infer({'img': img})
+lab = np.argsort(results)  # probs and lab are the results of one batch data
+print "Label of image/infer_3.png is: %d" % lab[0][0][-1]
+```
+
+## 总结
+
+本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型，后续章节中复杂的神经网络都是从它们衍生出来的，因此这几个模型对之后的学习大有裨益。同时，我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候，MNIST数据集上的识别准确率有了大幅度的提升，原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候，希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外，本教程还介绍了PaddlePaddle模型搭建的基本流程，从dataprovider的编写、网络层的构建，到最后的训练和预测。对这个流程熟悉以后，大家就可以用自己的数据，定义自己的网络模型，并完成自己的训练和预测任务了。
+
+## 参考文献
+
+1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324.
+2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014).
+3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190.
+4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003.
+5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007.
+6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220.
+7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010.
+8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009.
+9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386.
+10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58.
+
+<br/>
+<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f5cdaacdc6acce41c5c6c99649be46685cf9903
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
new file mode 100644
index 0000000000000000000000000000000000000000..65bd17eacd41bbdbdb042bd1ba366eb53663b410
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..030cd60d3b4af9aecd4941204da4ad15f6e1189f
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
new file mode 100644
index 0000000000000000000000000000000000000000..90b02fa2a735cfcc9efb2de90906325dedcb358c
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f4d26cd8da32201d0a5e9c72d466301dd2b42a1
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
new file mode 100644
index 0000000000000000000000000000000000000000..f5a478fdc24f29c17555a2f1451f3f5a079faed9
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..4edd7cabf8a2282f6392ac1421c7ca4afb288589
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
new file mode 100644
index 0000000000000000000000000000000000000000..40b98298288b9c406fce1cbca9c913753020a94d
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
new file mode 100644
index 0000000000000000000000000000000000000000..47204941af7f22e68386a70a06ec4f122b83e262
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png
new file mode 100644
index 0000000000000000000000000000000000000000..5cb87b450d0398bcfaec0e647c362052069797e7
Binary files /dev/null and b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png differ
diff --git a/doc/fluid/new_docs/faq/faq.rst b/doc/fluid/new_docs/faq/faq.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3b4bd4f895162fa3b0ba12e785e38ad694590b25
--- /dev/null
+++ b/doc/fluid/new_docs/faq/faq.rst
@@ -0,0 +1,12 @@
+###################
+编译安装与单元测试
+###################
+
+1. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
diff --git a/doc/fluid/new_docs/faq/index_cn.rst b/doc/fluid/new_docs/faq/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bb2ed99217609d3a9edd179d4f98ad5b8b649860
--- /dev/null
+++ b/doc/fluid/new_docs/faq/index_cn.rst
@@ -0,0 +1,9 @@
+FAQ
+====
+
+本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处，请您到 `PaddlePaddle社区 <https://github.com/PaddlePaddle/Paddle/issues>`_ 查找答案或直接提 `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_ ，我们会及时进行回复。
+ 
+..  toctree::
+  :maxdepth: 1
+  
+  faq.rst
diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst
new file mode 100644
index 0000000000000000000000000000000000000000..55c3c761f932713ffa2b462b35f9f46a8edae536
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst
@@ -0,0 +1,392 @@
+================================
+PaddleFluid设计思想和基本使用概念
+================================
+
+
+
+Paddle Fluid 是用来让用户像 PyTorch 和 Tensorflow Eager Execution 一样执行程序。
+在这些系统中，不再有模型这个概念，应用也不再包含一个用于描述 Operator 图或者一系列层的符号描述，
+而是像通用程序那样描述训练或者预测的过程。
+
+
+深度学习平台的演化
+================
+
+时至今日，深度学习已成为事实上最流行的机器学习技术。学术界多年研究加上工业界的长期实践提出了若干有效的基本建模单元：
+全连接，卷积，循环神经网络等；设计各类训练技巧：初始化方法，跨层连接，各类 norm 技术等；
+发明了各种新的优化算法：Adadelta，Adam 等；
+各类固定的网络结构：highway, residual, attention 等纷纷涌现，不胜枚举。
+学术界工业界多年的付出共同促成了深度学习方法今日的影响力。
+
+学术研究和生产实践中积累了大量的知识，能够很好的解释神经网络中基本模块各自独的学习能力和特性。
+基本模块和训练技术的组合能够搭建出千变万化的神经网络模型。
+基本模块和训练技术是有限的，但他们的组合却是千变万化，这是深度学习方法的魅力所在，也是难度所在。
+
+正是这样高度的模块化特性，研究者和工程师们都在努力避免重复造轮子以提高研究和生产的效率，
+又进一步催生了深度学习平台技术的发展，深度学习框架已演变成为 AI 基础设施中重要的一部分。
+从 Theano，到 DistBelief，到 TensorFlow；从 Caffe 到 Caffe2；
+从 Torch 到 PyTorch；从 PaddlePaddle 到 PaddleFluid，
+深度学习平台技术也经历了两代的演化，并向着第三代平台技术迈进。
+
+站在历史发展的今天，当我们准备切换尝试使用一个新的深度学习平台作为支持自己学习和研究的工具时，
+平台技术都发生了哪些演化，能够为我们的带来什么便利呢？
+
+先让我们来看看深度学习框架解决的三大问题：
+
+- 如何描述计算以支持未来潜在会出现的新模型？
+- 如何高效利用异构设备最大化算力？
+- 如何利用网络中的计算机进行分布式计算来处理千万亿级别的数据？
+
+以上三个问题中的第一个和使用者研究者最为密切相关。
+这篇文章我们通过分析 PaddleFluid的设计理念，
+来了解一个深度学习框架如何抽象深度学习模型，来看看我们的使用经验如何在不同深度学习平台之间过度和迁移。
+
+如何描述计算
+=============
+
+让我们首先来看看 PaddleFluid 如何描述机器学习模型
+
+
+PaddleFluid之 :code:`Program`
+
+如何描述计算很大程度决定了一个神经网络框架计算功能的完备性。
+深度学习模型和方法历经二十多年的发展：“依次执行一组计算的前向，
+再以和前向计算相反的顺序执行反向计算，中间无分支无交互”，
+这样的模型结构已经无法满足研究者和千千万万框架使用者的想象力。
+
+从 `PaddleFluid 的设计目标 <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md>`_ 来看，
+在如何描述机器学习模型这一核心问题上，PaddleFluid 的目标是：
+创造一种新的计算描述方式，不但能够描述至今为止人们已知的主流神经网络模型，并且能够支持未来会出现的任意模型。
+
+PaddleFluid 是如何做到支持未来出现的新模型这一目标呢？PaddleFluid 的设计选择是：
+对用户来说，用一段 :code:`Program` （在 PaddleFluid 内部会被转化为一种叫作 :code:`ProgramDesc` 的描述语言），
+而不是用计算图来描述机器学习模型。 :code:`Program` 用符合用户使用直觉的方式，
+提供一种新的描述语言能够描述任意复杂的机器学习模型。
+
+对所有计算机专业同学学习编程语言的第一课一定是建立对“程序语言的三种执行结构：顺序执行，条件选择和循环执行”的认识。
+计算机世界的所有可计算逻辑都是由这三种执行结构表示，用这三种结构描述的逻辑是可计算的。那么同样道理，
+对一个神经网络框架来说，如果可以和程序语言一样提供对这三种执行结构的支持，那么将可以描述任意复杂的，
+可被计算机计算的机器学习模型。PaddleFluid通过提供对这三种执行结构的支持，来做到对任意复杂模型的描述。
+
+具体来说：
+
+1. Fluid 的核心设计理念都可以类比到程序语言，如果已经有写程序的经验，那么使用 Fluid 构建神经网络模型的体验，将非常接近写程序；
+
+2. 在 PaddleFluid 中，用户不会显示地感知“计算图”这样的概念，一个机器学习模型被描述为一个 Fluid :code:`Program` （Fluid 内部称之为 :code:`ProgramDesc` ）；
+
+- 一个 Fluid :code:`Program` 由一组嵌套的 :code:`Block` 构成。 :code:`Block` 的概念可以类比到 C++ 或是 Java 中的一对大括号，或是 Python 语言中的一个缩进快；
+-  :code:`Block` 中的计算由顺序执行、条件选择或者循环执行三种方式组合，构成复杂的计算逻辑。
+
+3. Fluid :code:`Program` 中包含对计算和计算对象的描述。计算的描述称之为 Operator；计算作用的对象（或者说 Operator 的输入和输出）被统一为 Tensor。
+
+在描述计算和计算的作用对象这一问题上，各个深度学习框架的选择是相同的，如果有一个平台的使用经验，那么将非常容易在各个平台之间进行迁移。
+
+核心使用概念
+=============
+
+下面，我们将更详细地了解核心使用概念在PaddlePaddle的使用方法。
+
+数据表示和计算的对象：Tensor
+--------------------------
+
+Tensor 是向量矩阵概念的扩展，是神经网络模型计算操作的基本对象。这在是今天所有主流深度学习平台的共同选择。
+
+可以简单地将 Tensor 理解为一个 N 维向量，它可以有任意多的维度。一个 Tensor 具有两个基本特征：
+
+1. 数据类型：每个 Tensor 的所有元素具有同样的、已知的数据类型；
+
+2. 大小（或者说形状）：即维度的个数（rank，阶）以及各维度的长度。
+
+Tensor 某些维度的长度在定义模型阶段可能是未知的，在实际算法执行时才能确定。例如一个 mini-batch 中包含的样本数目（batch size），或者是一个 mini-batch 中序列的最大长度。
+
+PaddleFluid中的Tensor
+""""""""""""""""""""""
+
+PaddleFluid 中也使用 Tensor 作为神经网络中输入输出数据的统一表示。Tensor 的概念在今天主流的深度学习平台中都是完全相同，可以在各个深度学习框架之间直接无缝迁移。
+
+在 Fluid 中也同样存在三种特殊的 Tensor：
+
+1. 模型中的可学习参数
+
+模型中的可学习参数生存期和整个训练任务一样长，会接受优化算法的更新。在 PaddleFluid 中同样以 :code:`Variable` 表示；
+用户在绝大多数情况下都不需要自己来创建网络中的可学习参数，Fluid 为几乎常见的神经网络基本计算模块都提供了封装。
+以最简单的全连接模型为例，下面的代码片段会直接为全连接层创建连接权值 WW 和偏置（ :code:`bias` ）两个可学习参数，
+无需显示地调用 variable 相关接口创建可学习参数。
+
+
+::
+
+  import paddle.fluid as fluid
+
+  y = fluid.layers.fc(input=x, size=128, bias_attr=True)
+
+2. 输入输出Tensor
+
+整个神经网络的输入数据也是一个特殊的 Tensor，在这个 Tensor 中，
+一些维度的大小在定义模型时无法确定（通常包括：batch size；
+如果 mini-batch 之间，数据可变，也会包括序列的最大长度，图片的宽度和高度等），在定义模型时需要占位；
+PaddleFluid 中使用 :code:`fluid.layers.data` 来接入输入数据， :code:`fluid.layer.data` 需要提供输入 Tensor 的 形状信息，
+当遇到无法确定的维度 时， 相应维度指定为 None ，如下面的代码片段所示：
+
+::
+
+  import paddle.fluid as fluid
+
+  x = fluid.layers.data(name="x", shape=[2, None, 3], dtype="int64")
+
+3. 常量 Tensor 在 PaddleFluid 中需要通过组合 Tensor 和 :code:`fluid.layers.assign` 来实现。
+
+
+计算原语：Operation/Operator
+----------------------------
+
+Tensor 是今天所有主流深度学习框架的统一数据表示（输入、输出、中间计算结果、模型的可学习参数都是 Tensor）。
+另一方面，对数据的操作，在主流深度学习框架中也高度统一为：Operator/Operation。
+在中文中，通常我们会习惯将其称之为算子。
+
+注：在 PaddleFluid 中使用 Operator 称呼对 Tensor 的操作。
+
+Operation/Operator 接受多个 Tensor 作为输入，输出若干个 Tensor，表示了从输入到输出的变化。
+
+PaddleFluid中的Operator
+""""""""""""""""""""""""
+
+PaddleFluid 支持的所有算子，可以在 `API 帮助文档 <http://www.paddlepaddle.org/docs/develop/api/en/fluid/layers.html>`_ 中查看。
+
+为了便于用户使用，在 Python 端，Fluid 中的 Operator 被进一步封装入 :code:`paddle.fluid.layers` ，
+:code:`paddle.fluid.networks` 等模块。这是因为：一些常见的对Tensor的操作可能是有更多基础操作构成，
+例如：l2 norm 内部由 reduce、elementwise_add，scale 等多个 Operator 组合计算逻辑完成，
+为了提高使用的便利性，框架内部对基础 Operator 进行了一些封装，包括创建 Operator 依赖可学习参数，
+可学习参数的初始化细节等，减少用户重复开发的成本。
+
+对所有深度学习框架都面临同样的封装，在绝大多数情况下，用户很少会直接与框架底层的 Operator 直接打交道，而是使用框架提供的 layers，networks 等模块，降低开发的代码量。不论是什么样的概念，他们在各框架之间的本质和作用都是相同的：对 Tensor 的变换。
+
+总结
+>>>>>>
+
+不论叫作 Operation、Operator 还是 layers，他们在各深度学习平台中的含义和作用都是相同的：对 Tensor 的变换。是一个深度学习平台提供的基础计算能力。可以在每个平台各自的 API 帮助文档中查到。
+
+在各个深度学习平台都已加入 ONNX 项目的今天，每个深度学习平台提供给大家的基本算子都已趋同，与此同时，每个平台也各有其特点，会提供一些独特的算子，方便某一类任务的开发。
+
+构建模型并执行
+--------------
+
+整个训练任务运行方法如下：
+
+Fluid中的Program和Executor
+"""""""""""""""""""""""""""
+
+1. Fluid 使用 :code:`Program` 描述神经网络模型，对用户来说，并没有计算图的概念。
+用户定义的所有 Tensor 以及对 Tensor 的操作：Operator 都会被加入一段 :code:`Program` 中；
+
+一段 Program 由嵌套的 :code:`Block` 构成，但用户无需显示地创建 :code:`Block` 或是显示地注意到 :code:`Block` 的存在；
+在 Fluid 程序中， :code:`Block` 是在调用 :code:`while_op` ， :code:`if_op` ， :code:`parallel_do` 等特殊 :code:`Operator` 时，由这些 :code:`Operator` 来创建；
+对用户使用来说，只需要知道自己正在向一段 Fluid Program 中添加变量（ :code:`Tensor` ）和操作（ :code:`Operator` ）即可。
+
+2. Fluid 利用 :code:`Executor` 来执行一段 Fluid :code:`Program` 。
+
+为进一步理解 Fluid 中 :code:`Executor` 的作用，需要先解释一下 Fluid 程序的执行流程。 下图展示单机上，Fluid 程序的执行流程：
+
+.. figure:: fluid_local_train.jpeg
+
+   :scale: 50%
+   :align: center
+
+   Figure.1
+
+   Fluid本地训练任务执行流程图
+
+1. Fluid 设计思想和灵感非常类似于程序设计语言，和高级编译语言 C++/Java 编写程序的过程非常类似，Fluid 程序执行分为两个重要阶段：编译时和运行时；
+
+2. 编译期，用户通过调用 Fluid 提供的算子，向一段 :code:`Program` 中添加变量（Tensor）以及对变量的操作（Operators 或者 Layers）。用户只需要描述核心的前向计算，不需要关心反向计算，分布式下，异构设备下如何计算；
+
+3. 原始的 :code:`Program` 在平台内部转换为中间描述语言： :code:`ProgramDesc` ；
+
+4. 编译期最重要的一个功能模块是 Transpiler。Transpiler 接受一段 :code:`ProgramDesc` ，输出一段变化后的 :code:`ProgramDesc` ，作为后端 Executor 最终需要执行的 :code:`Fluid Program` ；
+
+最为常用的 Transipler 包括：
+
+1. 内存优化 Transipler：通过对变量读写依赖关系分析，插入内存回收 Operator 以维持运行过程中较小的内存开销；
+
+2. 分布式环境下的 Transpiler：接受用户定义的 local Program ，生成 Parameter Client 和 Parameter Server 执行的两段 :code:`Program` 。
+
+3. 后端 Executor 接受 Transpiler 输出的这段 :code:`Program` ，依次执行其中的 Operator（可以类比为程序语言中的指令），在执行过程中会为 Operator 创建所需的输入输出并进行管理。
+
+从上面的过程中可以看到，Fluid 程序的执行过程分为：编译器的定义 :code:`Program` ，和创建 :code:`Executor` 运行 :code:`Program` 。
+ :code:`Executor` 执行一段 :code:`Program` 的过程是不可交互和不可中断的。
+
+在 Fluid 中，可以创建多余一段 :code:`Program` 。默认情况，一个 PaddleFluid 程序中存在 2 段 Program：
+
+1.  :code:`fluid.framework.default_startup_program` ：其中定义了创建模型参数，输入输出，以及模型中可学习参数的初始化等各种操作；
+
+-  :code:`default_startup_program` 可以由框架自动生成，使用时无需显示地创建；
+- 如果调用修改了参数的默认初始化方式，框架会自动的将相关的修改加入 :code:`default_startup_program` 。
+
+2.  :code:`fluid.framework.default_main_program` ：定义了神经网络模型，前向反向计算，以及优化算法对网络中可学习参数的更新；
+
+- 使用 Fluid 的核心就是构建起 :code:`default_main_program` 。
+
+3. PaddleFluid 中的 :code:`Scope` 类似于 TensorFlow 中的 collection 这一概念，但在 Fluid 中 :code:`Scope` 是框架后端概念，用户无法直接操作。因此，在使用框架时无需关心。
+
+总结
+"""""
+
+Fluid 中通过 Executor 来执行一段用户定义的 Fluid :code:`Program` 。
+1. Executor 连接了 Fluid 的前端和后端；
+
+2. Executor 接受用户定义的原始模型（一段 :code:`Program` ），通过调用系统中不同功能更的 :code:`Transpiler` 完成对原始 :code:`Program` 的变化，进行优化。
+
+完整实例：如何完成一个机器学习模型的训练
+===================================
+
+
+
+这一节，我们以 MNIST 手写数字识别问题 —— 机器学习任务的“Hello World”问题和数据，为例，通过一个可以运行的完整实例，来学习上文介绍的概念如何在PaddleFluid 平台使用。
+
+步骤1：定义数据
+----------------
+
+PaddleFluid 中以 :code:`fluid.layers.data` 来接收输入数据。
+
+::
+
+  import numpy as np
+
+  import paddle.fluid as fluid
+  import paddle.v2 as paddle
+
+  # define the input layers for the network.
+  x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
+  y_ = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+Fluid 中 Tensor 的第 0 维度固定为 batch size。在上面代码段中，图像输入 :code:`x` 的形状为：[1, 28, 28]。这三个维度的含义分别是：channel 数目，图像的高度和宽度。
+
+实际上 Fluid 框架内部,一幅图像输入是一个 4-D Tensor，所有 Tensor 的第 0 维固定为 batch size。框架内部会自动为batch size进行填充占位。无需对batch size指定填充占位。
+
+如果除去 batch size（第 0 维度）外，如果 Tensor 某一维度的大小只能在运行时确定，可以在该位置上直接指定 :code:`None` 进行占位。
+
+步骤2：定义模型
+--------------
+
+通过调用 Fluid 提供的算子定义含有一个隐层的神经网络。Fluid 模型的分为模型结构和优化方法两部分。这一点与 TensorFlow 程序十分相似似，使用概念可以直接对应进行迁移。
+
+::
+
+  # define the network topology.
+  y = fluid.layers.fc(input=x, size=10, act="softmax")
+  loss = fluid.layers.cross_entropy(input=y, label=y_)
+  avg_loss = fluid.layers.mean(loss)
+
+  # define the optimization algorithm.
+  optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
+  optimizer.minimize(avg_loss)
+
+Fluid 使用 Program 而不是计算图描述模型，一般情况下，用户无需关心 Program 的细节，当调用以上 layers 时，会向一个全局的 Program： :code:`fluid.framework.default_main_program` 中插入变量（Tensor）和对变量的操作（上述代码段中的 layers 和 optimzier）。
+
+步骤3：参数初始化
+----------------
+
+如上文介绍，Fluid 程序中的 Executor 是连接 Fluid 前端和后端的接口。
+
+默认一个Fluid模型存在至少两段 Program。用于初始化网络中的可学习参数的那一段 :code:`Program` 叫作 :code:`fluid.default_startup_program()` 。
+
+只有执行器 executor 可以执行 Fluid Program，因此，在初始化网络中的可学习参数之前，需要首先创建一个 Fluid executor。
+
+::
+
+  # define the executor.
+  place = fluid.CPUPlace()
+  exe = fluid.Executor(place)
+  exe.run(fluid.default_startup_program())
+
+在以上代码段中， :code:`place` 用于告诉 executor 一段 Fluid Program 在何种设备上执行，
+常见的有 :code:`fluid.CPUPlace()` 和 :code:`fluid.CUDAPlace()` 。
+
+步骤4：数据输入 + 执行模型训练
+----------------------------
+
+我们在步骤 2 中定义的神经网络模型最终被插入一段叫做 :code:`fluid.framework.default_main_program` 的 Fluid Program 中。
+
+网络可学习参数初始化之后，可以通过让执行器 Executor 执行这段 :code:`fluid.framework.default_main_program` 来进行训练。
+
+::
+
+  train_reader = paddle.batch(
+        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000),
+        batch_size=BATCH_SIZE)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y_])
+
+  for pass_id in range(100):
+    for batch_id, data in enumerate(train_reader()):
+        loss = exe.run(
+            fluid.framework.default_main_program(),
+            feed=feeder.feed(data),
+            fetch_list=[avg_loss])
+        print("Cur Cost : %f" % (np.array(loss[0])[0]))
+
+从上面的代码片段中可以看到，Fluid 程序的训练过程和 TensorFlow 程序的训练过程非常接近，
+都放在一个 :code:`for` 循环中，循环读取一个 mini-batch 数据，
+调用执行器执行 Fluid :code:`default_main_program` ：接收 mini-batch 输入，在其上进行前向，反向和参数更新计算。
+
+`注：上面程序使用了 Fluid 内置的 MNIST 数据，和我们提供给 TensorFlow 示例程序的 MNIST 数据完全一样。`
+
+步骤5：观察模型效果
+-----------------
+
+以上步骤已经构成了完整的 Tensorflow 模型训练程序，每个 batch 观察一次 loss，可以直观看到模型的迭代效果：
+
+.. figure:: fluid_mnist.png
+
+   :scale: 40%
+   :align: center
+
+   Figure.2
+
+   Fluid MNIST手写数字识别任务代价下降曲线
+
+附：完整代码
+------------
+
+::
+
+  import numpy as np
+
+  import paddle.fluid as fluid
+  import paddle.v2 as paddle
+
+
+  def main():
+      BATCH_SIZE = 128
+
+      # define the input layers for the network.
+      x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
+      y_ = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+      # define the network topology.
+      y = fluid.layers.fc(input=x, size=10, act="softmax")
+      loss = fluid.layers.cross_entropy(input=y, label=y_)
+      avg_loss = fluid.layers.mean(loss)
+
+      optimizer = fluid.optimizer.Adam(learning_rate=5e-3)
+      optimizer.minimize(avg_loss)
+
+      # define the executor.
+      place = fluid.CPUPlace()
+      exe = fluid.Executor(place)
+      exe.run(fluid.default_startup_program())
+
+      train_reader = paddle.batch(
+          paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000),
+          batch_size=BATCH_SIZE)
+      feeder = fluid.DataFeeder(place=place, feed_list=[x, y_])
+
+      for pass_id in range(100):
+          for batch_id, data in enumerate(train_reader()):
+              loss = exe.run(
+                  fluid.framework.default_main_program(),
+                  feed=feeder.feed(data),
+                  fetch_list=[avg_loss])
+              print("Cur Cost : %f" % (np.array(loss[0])[0]))
+
+  if __name__ == "__main__":
+      main()
diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..0a495901fafb85987e34acc3c454fb87e8160fca
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg differ
diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5ad0ba058c863cf68ef0789e58fcf67b3115fdb
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png differ
diff --git a/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5946a2ccb7e43004eae39ec4b3c6112c66c1fd04
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst
@@ -0,0 +1,88 @@
+..  _user_guide_configure_simple_model:
+
+##############
+配置简单的网络
+##############
+
+在解决实际问题时，可以先从逻辑层面对问题进行建模，明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py>`_。
+
+问题描述及定义
+##############
+
+问题描述: 给定一组数据 :math:`<X, Y>`，求解出函数 :math:`f`，使得 :math:`y=f(x)`，其中 :math:`x\subset X` 表示一条样本的特征，为 :math:`13` 维的实数向量；:math:`y \subset Y` 为一实数表示该样本对应的值。
+
+我们可以尝试用回归模型来对问题建模，回归问题的损失函数有很多，这里选择常用的均方误差。为简化问题，这里假定 :math:`f` 为简单的线性变换函数，同时选用随机梯度下降算法来求解模型。
+
++----------------+----------------------------------------------+
+| 输入数据类型   |  样本特征: 13 维 实数                        |
++                +----------------------------------------------+
+|                |  样本标签: 1 维 实数                         |
++----------------+----------------------------------------------+
+| 计算逻辑       | 使用线性模型，产生 1维实数作为模型的预测输出 |
++----------------+----------------------------------------------+
+| 求解目标       | 最小化模型预测输出与样本标签间的均方误差     |
++----------------+----------------------------------------------+
+| 优化算法       | 随机梯度下降                                 |
++----------------+----------------------------------------------+
+
+使用PaddlePadle建模
+###################
+
+从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后，需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分，分别是：输入数据格式定义，模型前向计算逻辑，损失函数以及优化算法。
+
+数据层
+------
+
+PaddlePaddle提供了 :code:`fluid.layers.data()` 算子来描述输入数据的格式。
+
+:code:`fluid.layers.data()` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力，可以表示多维数据。为了精确描述数据结构，通常需要指定数据shape以及数值类型type。其中shape为一个整数向量，type可以是一个字符串类型。目前支持的数据类型参考    :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据，而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size，所以这里提供shape时不用关心batch size，只需关心一条样本的shape即可，更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知，:math:`x` 为 :math:`13` 维的实数向量，:math:`y` 为实数，可使用下面代码定义数据层：
+
+.. code-block:: python
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+该模型使用的数据比较简单，事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。
+
+前向计算逻辑
+------------
+
+实现一个模型最重要的部分是实现计算逻辑，PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同，通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子，以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子，一般地可以使用下面的方式：
+
+.. code-block:: python
+
+    op_1_out = fluid.layers.op_1(input=op_1_in, ...)
+    op_2_out = fluid.layers.op_2(input=op_1_out, ...)
+    ...
+
+其中op_1和op_2表示算子类型，可以是fc来执行线性变换(全连接)，也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中，op_1的输出是op_2的输入，那么在执行计算时，会先计算op_1，然后计算op_2。更复杂的模型可能需要使用控制流算子，依据输入数据来动态执行，针对这种情况，PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :code:`fluid.layers`。具体到这个任务, 我们使用一个fc算子：
+
+.. code-block:: python
+
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+损失函数
+--------
+
+损失函数对应求解目标，我们可以通过最小化损失来求解模型。大多数模型使用的损失函数，输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时，损失算子的输出有多个值，每个值对应一条样本的损失，所以通常会在损失算子后面使用mean等算子，来对损失做归约。模型在一次前向迭代后会得到一个损失值，PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失：
+
+.. code-block:: python
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+优化方法
+--------
+
+确定损失函数后，可以通过前向计算得到损失值，然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数，最简单的算法是随机梯度下降法：:math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果，学术界先后提出了很多优化算法，包括： :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数，一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法，学习率一般是一个需要指定的比较重要的超参数，需要通过实验仔细调整。这里采用随机梯度下降算法：
+
+.. code-block:: python
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+
+更多优化算子可以参考 :code:`fluid.optimizer()` 。
+
+下一步做什么？
+##############
+
+使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同，涉及的计算逻辑不同，损失函数不同，优化方法也不同。PaddlePaddle提供了丰富的模型示例，可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 <https://github.com/PaddlePaddle/models/tree/develop/fluid>`_ 查看官方提供的示例。
diff --git a/doc/fluid/new_docs/user_guides/howto/debug/index.rst b/doc/fluid/new_docs/user_guides/howto/debug/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0878e17b4069be6b08bc85a35e77ba6421633218
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/debug/index.rst
@@ -0,0 +1,10 @@
+############
+Debug 工具
+############
+
+PaddlePaddle 提供了如下方式方便 Debug 训练 情况
+
+.. toctree::
+   :maxdepth: 2
+
+   visualdl.md
diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
new file mode 100644
index 0000000000000000000000000000000000000000..a2f30823a6fcd379f94e6e98d043b0d00681827f
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -0,0 +1,218 @@
+# VisualDL (Visualize the Deep Learning)
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/VisualDL/develop/docs/images/vs-logo.png" width="60%" />
+</p>
+
+## 介绍
+VisualDL是一个面向深度学习任务设计的可视化工具，包含了scalar、参数分布、模型结构、图像可视化等功能，项目正处于高速迭代中，新的组件会不断加入。
+
+目前大多数DNN平台均使用Python作为配置语言，VisualDL原生支持python的使用，
+通过在模型的Python配置中添加几行，便可以为训练过程提供丰富的可视化支持。
+
+除了Python SDK之外，VisualDL底层采用C++编写，其暴露的C++ SDK也可以集成到其他平台中，
+实现原生的性能和定制效果。
+
+## 组件
+VisualDL 目前支持4种组件：
+
+- graph
+- scalar
+- image
+- histogram
+
+### Graph
+兼容 ONNX(Open Neural Network Exchange)[https://github.com/onnx/onnx], 通过与 python SDK的结合，VisualDL可以兼容包括 PaddlePaddle, pytorch, mxnet在内的大部分主流DNN平台。
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/daming-lu/large_files/master/graph_demo.gif" width="60%" />
+</p>
+
+### Scalar
+可以用于展示训练测试的误差趋势
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/loss_scalar.gif" width="60%"/>
+</p>
+
+### Image
+可以用于可视化任何tensor，或模型生成的图片
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/loss_image.gif" width="60%"/>
+</p>
+
+### Histogram
+
+用于可视化任何tensor中元素分布的变化趋势
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/histogram.gif" width="60%"/>
+</p>
+
+## 快速尝试
+请使用下面的命令，来快速测试 VisualDL。
+
+```
+# 安装，建議是在虚拟环境或anaconda下。
+pip install --upgrade visualdl
+
+# 运行一个例子，vdl_create_scratch_log 将创建测试日志
+vdl_create_scratch_log
+visualDL --logdir=scratch_log --port=8080
+
+# 访问 http://127.0.0.1:8080
+```
+
+如果以上步骤出现问题，很可能是因为python或pip不同版本或不同位置所致，以下安装方法能解决。
+
+## 使用 virtualenv 安装
+
+[Virtualenv](https://virtualenv.pypa.io/en/stable/) 能创建独立Python环境，也能确保Python和pip的相对位置正确。
+
+在macOS上，安装pip和virtualenv如下：
+```
+sudo easy_install pip
+pip install --upgrade virtualenv
+```
+
+在Linux上，安装pip和virtualenv如下:
+```
+sudo apt-get install python3-pip python3-dev python-virtualenv
+```
+
+然后创建一个虚拟环境：
+```
+virtualenv ~/vdl  # for Python2.7
+virtualenv -p python3 ~/vdl for Python 3.x
+```
+
+```~/vdl``` 是你的Virtualenv目录, 你也可以选择任一目录。
+
+激活虚拟环境如下：
+```
+source ~/vdl/bin/activate
+```
+
+现在再安装 VisualDL 和运行范例：
+
+```
+pip install --upgrade visualdl
+
+# 运行一个例子，vdl_create_scratch_log 将创建测试日志
+vdl_create_scratch_log
+visualDL --logdir=scratch_log --port=8080
+
+# 访问 http://127.0.0.1:8080
+```
+
+如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。
+
+
+## 使用 Anaconda 安装
+
+Anaconda是一个用于科学计算的Python发行版，提供了包管理与环境管理的功能，可以很方便地解决多版本python并存、切换以及各种第三方包安装问题。
+
+请根据[Anaconda下载网站](https://www.anaconda.com/download) 的指示去下载和安装Anaconda.
+下载Python 3.6版本的command-Line installer.
+
+创建conda环境名字为```vdl```或任何名字:
+```
+conda create -n vdl pip python=2.7 # or python=3.3, etc.
+```
+
+激活conda环境如下:
+```
+source activate vdl
+```
+
+现在再安装 VisualDL 和运行范例：
+
+```
+pip install --upgrade visualdl
+
+# 运行一个例子，vdl_create_scratch_log 将创建测试日志
+vdl_create_scratch_log
+visualDL --logdir=scratch_log --port=8080
+
+# 访问 http://127.0.0.1:8080
+```
+
+如果仍然遇到安装问题，请尝试以下用源代码安装方法。
+
+### 使用代码安装
+```
+#建議是在虚拟环境或anaconda下。
+git clone https://github.com/PaddlePaddle/VisualDL.git
+cd VisualDL
+
+python setup.py bdist_wheel
+pip install --upgrade dist/visualdl-*.whl
+```
+
+如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md)
+
+
+## SDK
+VisualDL 同时提供了python SDK 和 C++ SDK 来实现不同方式的使用。
+
+### Python SDK
+VisualDL 现在支持 Python 2和 Python 3。
+
+以最简单的Scalar组件为例，尝试创建一个scalar组件并插入多个时间步的数据：
+
+```python
+import random
+from visualdl import LogWriter
+
+logdir = "./tmp"
+logger = LogWriter(logdir, sync_cycle=10000)
+
+# mark the components with 'train' label.
+with logger.mode("train"):
+    # create a scalar component called 'scalars/scalar0'
+    scalar0 = logger.scalar("scalars/scalar0")
+
+# add some records during DL model running.
+for step in range(100):
+    scalar0.add_record(step, random.random())
+```
+
+### C++ SDK
+上面 Python SDK 中代码完全一致的C++ SDK用法如下
+```c++
+#include <cstdlib>
+#include <string>
+#include "visualdl/sdk.h"
+
+namespace vs = visualdl;
+namespace cp = visualdl::components;
+
+int main() {
+  const std::string dir = "./tmp";
+  vs::LogWriter logger(dir, 10000);
+
+  logger.SetMode("train");
+  auto tablet = logger.AddTablet("scalars/scalar0");
+
+  cp::Scalar<float> scalar0(tablet);
+
+  for (int step = 0; step < 1000; step++) {
+    float v = (float)std::rand() / RAND_MAX;
+    scalar0.AddRecord(step, v);
+  }
+
+  return 0;
+}
+```
+## 启动Board
+当训练过程中已经产生了日志数据，就可以启动board进行实时预览可视化信息
+
+```
+visualDL --logdir <some log dir>
+```
+
+board 还支持一下参数来实现远程的访问：
+
+- `--host` 设定IP
+- `--port` 设定端口
+- `--model_pb` 指定 ONNX 格式的模型文件
diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6f6698cadcba4d9645fdc4a8a74d899598b96d99
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst
@@ -0,0 +1,10 @@
+############
+模型评估和调试
+############
+
+PaddlePaddle Fluid提供了常用的模型评估指标，并提供了VisualDL工具可视化模型效果。
+
+.. toctree::
+   :maxdepth: 2
+
+   metrics
diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f37968a50350a90e698cb1a63bd501635753e7fb
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst
@@ -0,0 +1,62 @@
+############
+模型评估
+############
+
+模型评估是用指标反映模型在预期目标下精度，根据模型任务决定观察指标，作为在训练中调整超参数，评估模型效果的重要依据。
+metric函数的输入为当前模型的预测preds和labels，输出是自定义的。metric函数和loss函数非常相似，但是metric并不是模型训练网络组成部分。
+
+用户可以通过训练网络得到当前的预测preds和labels，在Python端定制metric函数；也可以通过定制c++ Operator的方式，在GPU上加速metric计算。
+
+paddle.fluid.metrics模块包含该功能
+
+
+常用指标
+############
+
+metric函数根据模型任务不同，指标构建方法因任务而异。
+
+回归类型任务labels是实数，因此loss和metric函数构建相同，可参考MSE的方法。
+分类任务常用指标为分类指标，本文提到的一般是二分类指标，多分类和多标签需要查看对应的API文档。例如排序指标auc，多分类可以作为0，1分类任务，auc指标仍然适用。
+Fluid中包含了常用分类指标，例如Precision, Recall, Accuracy等,更多请阅读API文档。以 :ref:`Precision` 为例，具体方法为
+
+.. code-block:: python
+
+   >>> import paddle.fluid as fluid
+   >>> labels = fluid.layers.data(name="data", shape=[1], dtype="int32")
+   >>> data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+   >>> pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+   >>> acc = fluid.metrics.Precision()
+   >>> for pass in range(PASSES):
+   >>>   acc.reset()
+   >>>   for data in train_reader():
+   >>>       loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+   >>>   acc.update(preds=preds, labels=labels)
+   >>>   numpy_acc = acc.eval()
+      
+
+其他任务例如MultiTask Learning，Metric Learning，Learning To Rank各种指标构造方法请参考API文档。
+
+自定义指标
+############
+Fluid支持自定义指标，灵活支持各类计算任务。下文通过一个简单的计数器metric函数，实现对模型的评估。
+其中preds是模型预测值，labels是给定的标签。
+
+.. code-block:: python
+
+   >>> class MyMetric(MetricBase):
+   >>>     def __init__(self, name=None):
+   >>>         super(MyMetric, self).__init__(name)
+   >>>         self.counter = 0  # simple counter
+
+   >>>     def reset(self):
+   >>>         self.counter = 0
+
+   >>>     def update(self, preds, labels):
+   >>>         if not _is_numpy_(preds):
+   >>>             raise ValueError("The 'preds' must be a numpy ndarray.")
+   >>>         if not _is_numpy_(labels):
+   >>>             raise ValueError("The 'labels' must be a numpy ndarray.")
+   >>>         self.counter += sum(preds == labels)
+
+   >>>     def eval(self):
+   >>>         return self.counter
diff --git a/doc/fluid/new_docs/user_guides/howto/modification/foo.rst b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst
@@ -0,0 +1,3 @@
+###
+FAQ
+###
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c3bf033bb8316eeb4901c0cdc61e0556c8816dac
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst
@@ -0,0 +1,169 @@
+.. _user_guide_use_numpy_array_as_train_data:
+
+###########################
+使用Numpy Array作为训练数据
+###########################
+
+PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
+再使用 Numpy Array 或者直接使用Python创建C++的
+:code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给
+:code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。
+
+数据层配置
+##########
+
+通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   image = fluid.layers.data(name="image", shape=[3, 224, 224])
+   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+   # use image/label as layer input
+   prediction = fluid.layers.fc(input=image, size=1000, act="softmax")
+   loss = fluid.layers.cross_entropy(input=prediction, label=label)
+   ...
+
+上段代码中，:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data`
+创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据;
+:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是:
+
+1. Fluid中默认使用 :code:`-1` 表示 batch size 维度，默认情况下会在 :code:`shape`
+   的第一个维度添加 :code:`-1` 。 所以 上段代码中， 我们可以接受将一个
+   :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size
+   维度的位置的话，请设置 :code:`fluid.layers.data(append_batch_size=False)` 。
+   请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。
+
+
+2. Fluid中用来做类别标签的数据类型是 :code:`int64`，并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。
+
+.. _user_guide_feed_data_to_executor:
+
+传递训练数据给执行器
+####################
+
+:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。
+这个参数是一个Python的字典。它的键是数据层的名字，例如上文代码中的 :code:`image`。
+它的值是对应的numpy array。
+
+例如:
+
+.. code-block:: python
+
+   exe = fluid.Executor(fluid.CPUPlace())
+   exe.run(feed={
+      "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
+      "label": numpy.random.random(size=(32, 1)).astype('int64')
+   })
+
+进阶使用
+########
+
+如何传入序列数据
+----------------
+
+序列数据是PaddlePaddle Fluid支持的特殊数据类型，可以使用 :code:`LoDTensor` 作为
+输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据;
+2.每个序列的长度信息。
+用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。
+
+传入序列信息的时候，需要设置序列嵌套深度，:code:`lod_level`。
+例如训练数据是词汇组成的句子，:code:`lod_level=1`；训练数据是 词汇先组成了句子，
+句子再组成了段落，那么 :code:`lod_level=2`。
+
+例如:
+
+.. code-block:: python
+
+   sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1)
+
+   ...
+
+   exe.run(feed={
+     "sentence": create_lod_tensor(
+       data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1),
+       lod=[4, 1, 2],
+       place=fluid.CPUPlace()
+     )
+   })
+
+训练数据 :code:`sentence` 包含三个样本，他们的长度分别是 :code:`4, 1, 2`。
+他们分别是 :code:`data[0:4]`， :code:`data[4:5]` 和 :code:`data[5:7]`。
+
+如何分别设置ParallelExecutor中每个设备的训练数据
+------------------------------------------------
+
+用户将数据传递给使用 :code:`ParallelExecutor.run(feed=...)` 时，
+可以显示指定每一个训练设备(例如GPU)上的数据。
+用户需要将一个列表传递给 :code:`feed` 参数，列表中的每一个元素都是一个字典。
+这个字典的键是数据层的名字，值是数据层的值。
+
+例如:
+
+.. code-block:: python
+
+   parallel_executor = fluid.ParallelExecutor()
+   parallel_executor.run(
+     feed=[
+        {
+          "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
+          "label": numpy.random.random(size=(32, 1)).astype('int64')
+        },
+        {
+          "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'),
+          "label": numpy.random.random(size=(16, 1)).astype('int64')
+        },
+     ]
+   )
+
+上述代码中，GPU0会训练 32 个样本，而 GPU1训练 16 个样本。
+
+
+.. _user_guide_customize_batch_size_rank:
+
+自定义BatchSize维度
+-------------------
+
+PaddlePaddle Fluid默认batch size是数据的第一维度，以 :code:`-1` 表示。但是在高级
+使用中，batch_size 可以固定，也可以是其他维度或者多个维度来表示。这都需要设置
+:code:`fluid.layers.data(append_batch_size=False)` 来完成。
+
+1. 固定batch size维度
+
+  .. code-block:: python
+
+     image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False)
+
+  这里，:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。
+
+2. 使用其他维度表示batch size
+
+  .. code-block:: python
+
+     sentence = fluid.layers.data(name="sentence",
+                                  shape=[80, -1, 1],
+                                  append_batch_size=False,
+                                  dtype="int64")
+
+  这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经
+  网络中。
+
+
+.. _user_guide_paddle_support_data_types:
+
+Fluid目前支持的数据类型
+-----------------------
+
+PaddlePaddle Fluid目前支持的数据类型包括:
+
+   * float16： 部分操作支持
+   * float32:  主要实数类型
+   * float64:  次要实数类型，支持大部分操作
+   * int32:  次要标签类型
+   * int64: 主要标签类型
+   * uint64: 次要标签类型
+   * bool: 控制流数据类型
+   * int16: 次要标签类型
+   * uint8: 输入数据类型，可用于图像像素
\ No newline at end of file
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..56fa928029903f1e3bd3e8064c146797f01b2b85
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
@@ -0,0 +1,52 @@
+..  _user_guide_prepare_data:
+
+########
+准备数据
+########
+
+PaddlePaddle Fluid支持两种传入数据的方式:
+
+1. 用户需要使用 :code:`fluid.layers.data`
+配置数据输入层，并在 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor`
+中，使用 :code:`executor.run(feed=...)` 传入训练数据。
+
+2. 用户需要先将训练数据
+转换成 Paddle 识别的 :code:`fluid.recordio_writer` ， 再使用
+:code:`fluid.layers.open_files` 以及 :code:`fluid.layers.reader` 配置数据读取。
+
+这两种准备数据方法的比较如下:
+
+.. _user_guide_prepare_data_comparision:
+
++------------+----------------------------------+---------------------------------------+
+|            |        Feed数据                  |         使用Reader                    |
++============+==================================+=======================================+
+| API接口    | :code:`executor.run(feed=...)`   |         :code:`fluid.layers.reader`       |
++------------+----------------------------------+---------------------------------------+
+| 数据格式   |           Numpy Array            | :code:`fluid.recordio_writer` |
++------------+----------------------------------+---------------------------------------+
+| 数据增强   | Python端使用其他库完成           | 使用Fluid中的Operator 完成            |
++------------+----------------------------------+---------------------------------------+
+|   速度     |                 慢               |                 快                    |
++------------+----------------------------------+---------------------------------------+
+| 推荐用途   |   调试模型                       |   工业训练                            |
++------------+----------------------------------+---------------------------------------+
+
+这些准备数据的详细使用方法，请参考:
+
+.. toctree::
+   :maxdepth: 2
+
+   feeding_data
+   use_recordio_reader
+
+Python Reader
+#############
+
+为了方便用户在Python中定义数据处理流程，PaddlePaddle Fluid支持 Python Reader，
+具体请参考:
+
+.. toctree::
+   :maxdepth: 2
+
+   reader.md
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa50e4d26166536eaf8044d527debd8ad46060f6
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md
@@ -0,0 +1,210 @@
+```eval_rst
+.. _user_guide_reader:
+```
+
+# Python Reader
+
+During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
+
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
+
+```
+iterable = data_reader()
+```
+
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
+
+An example implementation for single item data reader creator is as follows:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+```
+
+An example implementation for multiple item data reader creator is as follows:
+```python
+def reader_creator_random_image_and_label(width, height, label):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+```
+
+## Batch Reader Interface
+
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
+
+```python
+# a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items, each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple, below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It is easy to convert from a reader to a batch reader:
+
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+It is also straight forward to create a custom batch reader:
+
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
+## Usage
+
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
+
+```python
+# two data layer is created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
+
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do the following :
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader:
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why does a reader return only a single entry, and not a mini batch?
+
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2).
+
+We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
+
+### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ?
+
+In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
+
+### Why use a dictionary instead of a list to provide mapping?
+
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create a custom data reader creator ?
+
+```python
+def image_reader_creator(image_path, label_path, n):
+    def reader():
+        f = open(image_path)
+        l = open(label_path)
+        images = numpy.fromfile(
+            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+        images = images / 255.0 * 2.0 - 1.0
+        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+        for i in xrange(n):
+            yield images[i, :], labels[i] # a single entry of data is created each time
+        f.close()
+        l.close()
+    return reader
+
+# images_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train is:
+
+```python
+def train(batch_reader, mapping, batch_size, total_pass):
+    for pass_idx in range(total_pass):
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
+            do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dfda33f1b03516fe2c704f55d095955282b19109
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
@@ -0,0 +1,167 @@
+.. _user_guide_use_recordio_as_train_data:
+
+############################
+使用RecordIO文件作为训练数据
+############################
+
+相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
+:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
+但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
+:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
+用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝，
+使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
+
+将训练数据转换成RecordIO文件格式
+################################
+
+:code:`fluid.recordio_writer` 中，每个记录都是一个
+:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
+的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
+
+用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
+:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
+:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
+:ref:`user_guide_reader` 转换成多个RecordIO文件。
+
+具体使用方法为:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   import numpy
+
+   def reader_creator():
+       def __impl__():
+           for i in range(1000):
+               yield [
+                        numpy.random.random(size=[3,224,224], dtype="float32"),
+                        numpy.random.random(size=[1], dtype="int64")
+                     ]
+       return __impl__
+
+   img = fluid.layers.data(name="image", shape=[3, 224, 224])
+   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+
+   BATCH_SIZE = 32
+   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
+   fluid.recordio_writer.convert_reader_to_recordio_file(
+      "train.recordio", feeder=feeder, reader_creator=reader)
+
+其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
+:ref:`_api_fluid_data_feeder_DataFeeder`
+是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
+:ref:`user_guide_reader` 。
+
+上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
+其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
+用户可以将每一个Record的样本数设置成1。并参考
+:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
+
+
+配置神经网络, 打开RecordIO文件
+##############################
+
+RecordIO文件转换好之后，用户可以使用 :code:`fluid.layers.open_files()`
+打开文件，并使用 :code:`fluid.layers.read_file` 读取文件内容。
+简单使用方法如下:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   file_obj = fluid.layers.open_files(
+     filenames=["train.recordio"],
+     shape=[[3, 224, 224], [1]],
+     lod_levels=[0, 0],
+     dtypes=["float32", "int64"],
+     pass_num=100
+   )
+
+   image, label = fluid.layers.read_file(file_obj)
+
+其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
+直到读取了 :code:`pass_num` 遍。
+
+
+
+进阶使用
+########
+
+
+使用 :code:`fluid.layers.double_buffer()`
+------------------------------------------
+
+:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
+需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
+
+.. code-block:: python
+
+   import paddle.fliud as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fluid.layers.double_buffer(file_obj)
+
+   image, label = fluid.layers.read_file(file_obj)
+
+双缓冲技术可以参考
+`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
+
+配置数据增强
+------------
+
+使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   preprocessor = fluid.layers.Preprocessor(reader=data_file)
+   with preprocessor.block():
+       image, label = preprocessor.inputs()
+       image = image / 2
+       label = label + 1
+       preprocessor.outputs(image, label)
+
+如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
+:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
+:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
+:code:`preprocessor.outputs()` 标记预处理后的输出。
+
+.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
+
+使用Op组batch
+-------------
+
+使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fluid.layers.batch(file_obj, batch_size=32)
+
+   img, label = fluid.layers.read_file(file_obj)
+
+需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
+那么这几个样本直接组成一个批量数据进行训练。
+
+读入数据的shuffle
+-----------------
+
+使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
+
+   img, label = fliud.layers.read_file(file_obj)
+
+需要注意的是:
+
+1. :code:`shuffle` 实现方法是:
+先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
+
+2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
+足够支持缓存 :code:`buffer_size` 条数据。
diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c4afd536c67b24a17e4437ecedf779ddcddcbc98
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md
@@ -0,0 +1,60 @@
+# Checkpoint功能使用指南
+
+## 背景
+单机/多机在训练过程中会由于软件/硬件的问题出现异常，导致训练中断，进而导致训练无结果或结果不可用，浪费大量时间和机器性能。
+
+## 目的
+Checkpoint功能能够在训练中途对训练数据中间数据进行保存，出现异常恢复训练的时候能够加载中途保存的数据继续训练， 实现单机/多机的容错训练的功能。
+
+## 说明
+### 目前已实现的参数保存：
+1. 基于Trainer 0 实现训练过程中的参数保存
+2. 基于PServer 实现了```Distribute Lookup Table```相关参数保存
+### Fluid Checkpoint 保存数据目录结构：
+
+```
+checkpoint_dir (用户定义的checkpoint目录)
+├── checkpoint_0 (第一次保存)
+│   ├── __lockup_table__ (Distribute Lookup Table 目录)
+│   │   ├── table_pserver_0 (Pserver 0 号保存的lookup table 数据)
+│   │   └── table_pserver_1
+│   ├── __model__ (model 目录)
+│   │   └── var.w_1
+│   └── trainer_0 (trainer 自有数据保存)
+│       ├── epoch_id
+│       └── step_id
+└── checkpoint_1 (第二次保存)
+```
+
+## 使用方法
+### 声明Fluid.CheckpointConfig
+用户对checkpoint功能的配置，主要是配置对象```Fluid```中的```CheckpointConfig```.
+
+```CheckpointConfig``` 包括4个参数：
+
+| 参数 | 类型 | 说明 | 
+| - | :-: | - | 
+| checkpoint_dir | int| checkpoint存储目录 | 
+| max_num_checkpoints | int | 最大保存的checkpoint副本数 | 
+| epoch_interval | int | 每隔epoch_interval轮epoch |
+| step_interval | int | 每隔step_interval轮step |
+
+### 在Fluid.Trainer对象的声明中加入Fluid.CheckpointConfig的声明
+Trainer的__init__方法的参数中包含了对```CheckpointConfig```， 需要传入在声明Trainer前声明的```CheckpointConfig```对象。
+如：
+```python
+config = CheckpointConfig(
+    checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, 
+    epoch_interval = 2, step_interval = 10)
+trainer = Trainer(..., checkpoint_config=config)
+```
+定义和声明完成后， 训练在运行过程中就会在指定的step和epoch处进行保存，出现异常时，就会自动从最新的checkpoint目录进行参数恢复啦！
+
+## 相关API
+[Trainer API 说明](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py)
+
+## 注意
+1. 保证每个训练的```checkpoint_dir``` 与其他训练独立。
+2. 最大副本数量```max_num_checkpoints```需要根据磁盘容量以及模型的大小进行调整， 保证磁盘的可用性。
+3. ```epoch_interval```  和 ```step_interval```  不宜过小， 频繁的进行checkpoint会拖慢训练速度。
+4. **分布式训练**的过程中：每个Trainer都会在```checkpoint_dir```目录中保存当前Trainer的参数（只有Trainer 0会保存模型的参数），需要**分布式文件系统(HDFS等)**将同```checkpoint_dir```目录的数据进行合并才能得到完整的数据，恢复训练的时候需要用完整的数据进行恢复。
diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..14d37246ca0cab8715e244fda9624d0d59f8ec5f
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md
@@ -0,0 +1,62 @@
+# Checkpoint User Guide
+
+## Background
+In many cases, Stand-alone training and Distributed training can be aborted by the software problem or hardware problem. More seriously, we waste so much time and the performance of the machine but get nothing, which makes us frustrating and we have to restart it again.
+
+## Purpose
+The feature of ```Checkpoint``` can save Intermediate model variables, lookup table variable, and other needs data in checkpoint directory. When the exception occurs, we can load these variables from the checkpoint directory immediately.
+## Introduce
+### Complete Features Currently：
+1. The Trainer 0 will save model variables in training.
+2. Each of the Trainer will save its own arguments needed.
+3. Each of the Parameter Server will save ```Distribute Lookup Table``` variables in training.
+### Fluid Checkpoint directory structure：
+
+```
+checkpoint_dir (the checkpoint directory user define)
+├── checkpoint_0 (the first save directory)
+│   ├── __lockup_table__ (Distribute Lookup Table directory)
+│   │   ├── table_pserver_0 (Lookup table's data about Pserver 0)
+│   │   └── table_pserver_1
+│   ├── __model__ (model directory)
+│   │   └── var.w_1
+│   └── trainer_0 (each trainer will save its own data)
+│       ├── epoch_id
+│       └── step_id
+└── checkpoint_1 (the second save directory)
+```
+
+## usage
+### Fluid.CheckpointConfig construct
+When the user wants to use ```Checkpoint``` feature, the main thing user have to do is declare ```CheckpointConfig``` and construct it.
+
+```CheckpointConfig``` has 4 member variables need to be initialized：
+
+| Member Variable | Type | Comment | 
+| - | :-: | - | 
+| checkpoint_dir | int| checkpoint directory | 
+| max_num_checkpoints | int | Maximum number of checkpoint copies | 
+| epoch_interval | int |  epoch interval times |
+| step_interval | int | step interval times |
+
+### Add Fluid.CheckpointConfig's declaration in Fluid.Trainer
+Because the initialization of Trainer needs an instance of ```CheckpointConfig```., we should declare ```CheckpointConfig``` in ```Fluid``` first.
+
+For example：
+```python
+config = CheckpointConfig(
+    checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, 
+    epoch_interval = 2, step_interval = 10)
+trainer = Trainer(..., checkpoint_config=config)
+```
+
+After all the things done, the train will save checkpoint at the specified epoch and step, when the train is aborted, the user can restart it, the train will restore from the latest copy.
+
+## Related API
+[Related Trainer API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py)
+
+## Attention
+1. Make the ```checkpoint_dir``` only be used by one train job.
+2. The number of ```max_num_checkpoints``` need to be adjusted by the disk size and model size.
+3. Too frequently to slow down the train speed, so too ```small epoch_interval``` and ```step_interval``` are not suitable.
+4. **In distributed train**, each Trainer will save arguments in its ```checkpoint_dir``` (Only Trainer 0 will save model variables). We need **distributed file system (HDFS, etc)** to merge all the ```checkpoint_dir``` to get the whole data.
diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst
new file mode 100644
index 0000000000000000000000000000000000000000..00ec9e819c81fae3263b1f1e6bcedf524f2b3991
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst
@@ -0,0 +1,160 @@
+.. _cluster_howto
+
+Fluid分布式训练使用手册
+====================
+
+分布式训练基本思想
+---------------
+
+分布式深度学习训练通常分为两种并行化方法：数据并行，模型并行，参考下图：
+
+.. image:: src/parallelism.png
+
+在模型并行方式下，模型的层和参数将被分布在多个节点上，模型在一个mini-batch的前向和反向训练中，将经过多次跨\
+节点之间的通信。每个节点只保存整个模型的一部分；在数据并行方式下，每个节点保存有完整的模型的层和参数，每个节点\
+独自完成前向和反向计算，然后完成梯度的聚合并同步的更新所有节点上的参数。Fluid目前版本仅提供数据并行方式，另外\
+诸如模型并行的特例实现（超大稀疏模型训练）功能将在后续的文档中予以说明。
+
+在数据并行模式的训练中，Fluid使用了两种通信模式，用于应对不同训练任务对分布式训练的要求，分别为RPC通信和Collective
+通信。其中RPC通信方式使用 `gRPC <https://github.com/grpc/grpc/>`_ ，Collective通信方式使用
+`NCCL2 <https://developer.nvidia.com/nccl>`_ 。
+
+.. csv-table:: 下面是一个RPC通信和Collective通信的横向对比：
+   :header: "Feature", "Coolective", "RPC"
+
+   "Ring-Based通信", "Yes", "No"
+   "异步训练", "Yes", "Yes"
+   "分布式模型", "No", "Yes"
+   "容错训练", "No", "Yes"
+   "性能", "Faster", "Fast"
+
+- RPC通信方式的结构：
+
+  .. image:: src/dist_train_pserver.png
+
+  使用RPC通信方式的数据并行分布式训练，会启动多个pserver进程和多个trainer进程，每个pserver进程\
+  会保存一部分模型参数，并负责接收从trainer发送的梯度并更新这些模型参数；每个trainer进程会保存一份\
+  完整的模型，并使用一部分数据进行训练，然后向pserver发送梯度，最后从pserver拉取更新后的参数。
+
+  pserver进程可以在和trainer完全不同的计算节点上，也可以和trainer公用节点。一个分布式任务所需要的\
+  pserver进程个数通常需要根据实际情况调整，已达到最佳的性能，然而通常来说pserver的进程不会比trainer\
+  更多。
+
+  在使用GPU训练时，pserver可以选择使用GPU或只使用CPU，如果pserver也使用GPU，则会增加一次从CPU拷贝\
+  接收到的梯度数据到GPU的开销，在某些情况下会导致整体训练性能降低。
+
+- NCCL2通信方式的结构：
+
+  .. image:: src/dist_train_nccl2.png
+
+  使用NCCL2（Collective通信方式）进行分布式训练，是不需要启动pserver进程的，每个trainer进程都保存\
+  一份完整的模型参数，在完成计算梯度之后通过trainer之间的相互通信，Reduce梯度数据到所有节点的所有设备\
+  然后每个节点在各自完成参数更新。
+
+使用parameter server方式的训练
+------------------------------
+
+使用 :code:`trainer` API，程序可以自动的通过识别环境变量决定是否已分布式方式执行。
+
+.. csv-table:: 需要在您的分布式环境中配置的环境变量包括：
+   :header: "环境变量", "说明"
+
+   "PADDLE_TRAINING_ROLE", "当前进程的角色，可以是PSERVER或TRAINER"
+   "PADDLE_PSERVER_PORT", "parameter使用的端口"
+   "PADDLE_PSERVER_IPS", "parameter server的IP地址列表，用逗号分开"
+   "PADDLE_TRAINERS", "分布式任务中trainer节点的个数"
+   "PADDLE_CURRENT_IP", "当前节点的IP"
+   "PADDLE_TRAINER_ID", "trainer节点的id，从0~n-1，不能有重复"
+
+使用更加底层的 :code:`transpiler` API可以提供自定义的分布式训练的方法，比如可以在同一台机器上，
+启动多个pserver和trainer进行训练，使用底层API的方法可以参考下面的样例代码：
+
+.. code-block:: python
+
+   role = "PSERVER"
+   trainer_id = 0
+   pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171"
+   current_endpoint = "127.0.0.1:6170"
+   trainers = 4
+   t = fluid.DistributeTranspiler()
+   t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+   if role == "PSERVER":
+       pserver_prog = t.get_pserver_program(current_endpoint)
+       pserver_startup = t.get_startup_program(current_endpoint,
+                                               pserver_prog)
+       exe.run(pserver_startup)
+       exe.run(pserver_prog)
+   elif role == "TRAINER":
+       train_loop(t.get_trainer_program())
+
+
+选择同步或异步训练
+++++++++++++++++++
+
+Fluid分布式任务可以支持同步训练或异步训练，在同步训练方式下，所有的trainer节点，会在每个mini-batch
+同步地合并所有节点的梯度数据并发送给parameter server完成更新，在异步训练方式下，每个trainer没有相互\
+同步等待的过程，可以独立的parameter server的参数。通常情况下，使用异步训练方式，可以在trainer节点\
+更多的时候比同步训练方式有更高的总体吞吐量。
+
+在调用 :code:`transpile` 函数时，默认会生成同步训练的分布式程序，通过指定 :code:`sync_mode=False`
+参数即可生成异步训练的程序：
+
+.. code-block:: python
+
+   t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=False)
+
+
+选择参数分布方法
+++++++++++++++++
+
+参数 :code:`split_method` 可以指定参数在parameter server上的分布方式。
+
+Fluid默认使用 `RoundRobin <https://en.wikipedia.org/wiki/Round-robin_scheduling>`_
+方式将参数分布在多个parameter server上。此方式在默认未关闭参数切分的情况下，参数会较平均的分布在所有的
+parameter server上。如果需要使用其他，可以传入其他的方法，目前可选的方法有： :code:`RoundRobin` 和
+:code:`HashName` 。也可以使用自定义的分布方式，只需要参考
+`这里 <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/transpiler/ps_dispatcher.py#L44>`_
+编写自定义的分布函数。
+
+
+关闭切分参数
+++++++++++++
+
+参数 :code:`slice_var_up` 指定是否将较大（大于8192个元素）的参数切分到多个parameter server已均衡计算负载，默认为开启。
+
+当模型中的可训练参数体积比较均匀或者使用自定义的参数分布方法是参数均匀分布在多个parameter server上，
+可以选择关闭切分参数，这样可以降低切分和重组带来的计算和拷贝开销：
+
+.. code-block:: python
+
+   t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, slice_var_up=False)
+
+
+使用NCCL2通信方式的训练
+--------------------
+
+注NCCL2模式目前仅支持trainer API，NCCL2方式并没有很多可选项，也没有"transpiler"，所以并没有底层API。
+使用NCCL2方式同样需要配置每个节点的环境变量，此处与parameter server模式有所不同，并不需要启动独立的\
+parameter server的进程，只需要启动多个trainer进程即可。
+
+
+.. csv-table:: NCCL2模式环境变量说明：
+   :header: "环境变量", "说明"
+
+   "PADDLE_TRAINER_IPS", "所有Trainer节点的IP列表，用逗号分隔"
+   "PADDLE_TRAINER_ID", "trainer节点的id，从0~n-1，不能有重复"
+   "PADDLE_PSERVER_PORT", "一个端口，用于在NCCL2初始化时，广播NCCL ID"
+   "PADDLE_CURRENT_IP", "当前节点的IP"
+
+目前使用NCCL2进行分布式训练仅支持同步训练方式。使用NCCL2方式的分布式训练，更适合模型体积较大，并需要使用\
+同步训练和GPU训练，如果硬件设备支持RDMA和GPU Direct，可以达到很高的分布式训练性能。
+
+注意如果系统中有多个网络设备，需要手动指定NCCL2使用的设备，
+假设需要使用 :code:`eth2` 为通信设备，需要设定如下环境变量：
+
+.. code-block:: bash
+
+   export NCCL_SOCKET_IFNAME=eth2
+
+另外NCCL2提供了其他的开关环境变量，比如指定是否开启GPU Direct，是否使用RDMA等，详情可以参考
+`ncclknobs <https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#ncclknobs>`_ 。
diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6131c92d6f5386c7e91b2917d25dd7ae830ff182
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst
@@ -0,0 +1,143 @@
+..  _cluster_quick_start:
+
+分布式训练快速开始
+==================
+
+准备工作
+--------
+
+在本篇文章中，我们将会在介绍如何快速在一个集群中启动一个 PaddlePaddle
+的分布式训练任务，在开始之前，请按如下步骤做些准备工作：
+
+1. 准备一个至少4个节点的集群，并且保证网络可以联通，在本文中我们使用
+   ``*.paddlepaddle.com`` 来表示每个节点的主机名称，您可以根据集群的实际情况来修改它。
+
+2. 在开始之前确保已经阅读过 :ref:`how_to_install`
+   并且可以在集群的所有节点上可以正常运行 PaddlePaddle。
+
+启动集群训练任务
+----------------
+
+在启动集群训练脚本时，需要在不同的节点上指定不同的环境变量，具体如下：
+
++-----------------+-----------------+-----------------+---------------------+
+| 环境变量        | 数据类型        | 样例            | 描述                |
++=================+=================+=================+=====================+
+| PADDLE_TRAINING | str             | PSERVER,TRAINER | 训练节点的角色      |
+| _ROLE           |                 |                 |                     |
++-----------------+-----------------+-----------------+---------------------+
+| PADDLE_PSERVER_ | str             | ps0.paddlepaddl | 所有 pserver        |
+| IPS             |                 | e.com,ps1.paddl | 节点的 IP           |
+|                 |                 | epaddle.com…    | 地址或              |
+|                 |                 |                 | hostname,           |
+|                 |                 |                 | 用“,”分隔           |
++-----------------+-----------------+-----------------+---------------------+
+| PADDLE_PSERVER_ | int             | 6174            | pserver             |
+| PORT            |                 |                 | 节点监听的端口      |
++-----------------+-----------------+-----------------+---------------------+
+| PADDLE_TRAINERS | int             | 2               | 训练任务中          |
+|                 |                 |                 | trainer             |
+|                 |                 |                 | 节点的数量          |
++-----------------+-----------------+-----------------+---------------------+
+| PADDLE_CURRENT_ | str             | ps0.paddlepaddl | 当前 pserver        |
+| IP              |                 | e.com           | 节点的 IP           |
+|                 |                 |                 | 地址或 hostanme     |
++-----------------+-----------------+-----------------+---------------------+
+| PADDLE_TRAINER_ | int             | 0               | 当前 trainer        |
+| ID              |                 |                 | 节点的唯一 ID,      |
+|                 |                 |                 | 取值范围为从0开始到 |
+|                 |                 |                 | PADDLE_TRAINERS-1   |
++-----------------+-----------------+-----------------+---------------------+
+
+样例代码
+~~~~~~~~
+
+将下面程序代码保存为 ``fluid_dist.py``
+
+.. code:: python
+
+   import paddle
+   import paddle.fluid as fluid
+   import contextlib
+   import numpy
+   import unittest
+
+   # train reader
+   BATCH_SIZE = 20
+
+   train_reader = paddle.batch(
+       paddle.reader.shuffle(
+           paddle.dataset.uci_housing.train(), buf_size=500),
+       batch_size=BATCH_SIZE)
+
+   test_reader = paddle.batch(
+       paddle.reader.shuffle(
+           paddle.dataset.uci_housing.test(), buf_size=500),
+       batch_size=BATCH_SIZE)
+
+
+   def train_program():
+       y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+       x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+       y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+       loss = fluid.layers.square_error_cost(input=y_predict, label=y)
+       avg_loss = fluid.layers.mean(loss)
+
+       return avg_loss
+
+   def optimizer_func():
+       return fluid.optimizer.SGD(learning_rate=0.001)
+
+   def train(use_cuda, train_program):
+       place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+       trainer = fluid.Trainer(
+           train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+       def event_handler(event):
+           if isinstance(event, fluid.EndStepEvent):
+               if event.step == 10:
+                   test_metrics = trainer.test(
+                       reader=test_reader, feed_order=['x', 'y'])
+                   print("step {0}, loss: {1}".format(event.step, test_metrics))
+                   trainer.stop()
+
+       trainer.train(
+           reader=train_reader,
+           num_epochs=100,
+           event_handler=event_handler,
+           feed_order=['x', 'y'])
+
+   train(False, train_program)
+
+启动trainer节点和pserver节点
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. list-table::
+   :header-rows: 1
+
+   * - 启动节点
+     - 启动命令
+     - 说明
+   * - ps0.paddlepaddle.com
+     - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps0.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
+     - 启动 pserver 节点
+   * - ps1.paddlepaddle.com
+     - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps1.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
+     - 启动 pserver 节点
+   * - trainer0.paddlepaddle.com
+     - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=0 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
+     - 启动第0号 trainer 节点
+   * - trainer1.paddlepaddle.com
+     - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=1 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
+     - 启动第1号 trainer 节点
+
+**注意**
+
+-  需要先启动pserver节点再启动trainer节点
+-  看到trainer节点输出如下日志表示训练任务执行正确
+
+   .. code:: bash
+
+      step 10, loss: [258.2326202392578]
diff --git a/doc/fluid/new_docs/user_guides/howto/training/index.rst b/doc/fluid/new_docs/user_guides/howto/training/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..68475101e26b3f695c8003995cc1c6a95426ff27
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/index.rst
@@ -0,0 +1,12 @@
+############
+训练神经网络
+############
+
+PaddlePaddle Fluid支持单机训练，和多节点训练。每种训练模式下，都支持多种训练方法。
+
+.. toctree::
+   :maxdepth: 2
+
+   single_node
+   multi_node
+   save_load_variables
diff --git a/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst
new file mode 100644
index 0000000000000000000000000000000000000000..24316f0be0d8f211e680fa15cb432732b5967c79
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst
@@ -0,0 +1,9 @@
+########
+多机训练
+########
+
+.. toctree::
+   :maxdepth: 2
+
+   cluster_quick_start.rst
+   cluster_howto.rst
diff --git a/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a96776f4a17a1d6da170bdff9d81771c38912bb5
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst
@@ -0,0 +1,172 @@
+.. _user_guide_save_load_vars:
+
+##################
+保存与载入模型变量
+##################
+
+模型变量分类
+############
+
+在PaddlePaddle Fluid中，所有的模型变量都用 :code:`fluid.Variable()` 作为基类进行表示。
+在该基类之下，模型变量主要可以分为以下几种类别：
+
+1. 模型参数
+  模型参数是深度学习模型中被训练和学习的变量，在训练过程中，训练框架根据反向传播算法计算出每一个模型参数当前的梯度，
+  并用优化器根据梯度对参数进行更新。模型的训练过程本质上可以看做是模型参数不断迭代更新的过程。
+  在PaddlePaddle Fluid中，模型参数用 :code:`fluid.framework.Parameter` 来表示，
+  这是一个 :code:`fluid.Variable()` 的派生类，除了 :code:`fluid.Variable()` 具有的各项性质以外，
+  :code:`fluid.framework.Parameter` 还可以配置自身的初始化方法、更新率等属性。
+
+2. 长期变量
+  长期变量指的是在整个训练过程中持续存在、不会因为一个迭代的结束而被销毁的变量，例如动态调节的全局学习率等。
+  在PaddlePaddle Fluid中，长期变量通过将 :code:`fluid.Variable()` 的 :code:`persistable`
+  属性设置为 :code:`True` 来表示。所有的模型参数都是长期变量，但并非所有的长期变量都是模型参数。
+
+3. 临时变量
+  不属于上面两个类别的所有模型变量都是临时变量，这种类型的变量只在一个训练迭代中存在，在每一个迭代结束后，
+  所有的临时变量都会被销毁，然后在下一个迭代开始之前，又会先构造出新的临时变量供本轮迭代使用。
+  一般情况下模型中的大部分变量都属于这一类别，例如输入的训练数据、一个普通的layer的输出等等。
+
+
+
+如何保存模型变量
+################
+
+根据用途的不同，我们需要保存的模型变量也是不同的。例如，如果我们只是想保存模型用来进行以后的预测，
+那么只保存模型参数就够用了。但如果我们需要保存一个checkpoint以备将来恢复训练，
+那么我们应该将各种长期变量都保存下来，甚至还需要记录一下当前的epoch和step的id。
+因为一些模型变量虽然不是参数，但对于模型的训练依然必不可少。
+
+因此，根据需求的不同，我们提供了两套API来分别进行模型的参数和checkpoint的保存。
+
+保存模型用于对新样本的预测
+==========================
+
+如果我们保存模型的目的是用于对新样本的预测，那么只保存模型参数就足够了。我们可以使用
+:code:`fluid.io.save_params()` 接口来进行模型参数的保存。
+
+例如：
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    param_path = "./my_paddle_model"
+    prog = fluid.default_main_program()
+    fluid.io.save_params(executor=exe, dirname=param_path, main_program=None)
+
+上面的例子中，通过调用 :code:`fluid.io.save_params` 函数，PaddlePaddle Fluid会对默认
+:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描，
+筛选出其中所有的模型参数，并将这些模型参数保存到指定的 :code:`param_path` 之中。
+
+
+保存checkpoint用于将来恢复训练
+==============================
+
+在训练过程中，我们可能希望在一些节点上将当前的训练状态保存下来，
+以便在将来需要的时候恢复训练环境继续进行训练。这一般被称作“checkpoint”。
+想要保存checkpoint，可以使用 :code:`fluid.io.save_checkpiont()` 接口。
+
+例如：
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./checkpoints"
+    prog = fluid.default_main_program()
+    trainer_args = {"epoch_id": 200,
+                    "step_id": 20} # just an example
+    fluid.io.save_checkpoint(executor=exe,
+                                checkpoint_dir=path,
+                                trainer_id=0,
+                                trainer_args=trainer_args,
+                                main_program=prog,
+                                max_num_checkpoints=3)
+
+上面的例子中，通过调用 :code:`fluid.io.save_checkpoint` 函数，PaddlePaddle Fluid会对默认
+:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描，
+根据一系列内置的规则自动筛选出其中所有需要保存的变量，并将他们保存到指定的 :code:`path` 目录下。
+
+:code:`fluid.io.save_checkpoint` 的各个参数中， :code:`trainer_id` 在单机情况下设置为0即可； :code:`trainer_args`
+为一个Python dict，用于给定当前的epoch_id和step_id；
+:code:`max_num_checkpoints` 用于表示的最大checkpoint数量，
+如果目录中已经存在的checkpoint数量超过这个值，那最早的checkpoint将被删除。
+
+如何载入模型变量
+################
+
+与模型变量的保存相对应，我们提供了两套API来分别载入模型的参数和载入模型的checkpoint。
+
+载入模型用于对新样本的预测
+==========================
+
+对于通过 :code:`fluid.io.save_params` 保存的模型，可以使用 :code:`fluid.io.load_params`
+来进行载入。
+
+例如：
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    param_path = "./my_paddle_model"
+    prog = fluid.default_main_program()
+    fluid.io.load_params(executor=exe, dirname=param_path,
+                         main_program=prog)
+
+上面的例子中，通过调用 :code:`fluid.io.load_params` 函数，PaddlePaddle Fluid会对
+:code:`prog` 中的所有模型变量进行扫描，筛选出其中所有的模型参数，
+并尝试从 :code:`param_path` 之中读取加载它们。
+
+需要格外注意的是，这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_params`
+时所用的 :code:`prog` 中的前向部分完全一致，且不能包含任何参数更新的操作。如果两者存在不一致，
+那么可能会导致一些变量未被正确加载；如果错误地包含了参数更新操作，那可能会导致正常预测过程中参数被更改。
+这两个 :code:`fluid.Program` 之间的关系类似于训练 :code:`fluid.Program`
+和测试 :code:`fluid.Program` 之间的关系，详见： :ref:`user_guide_test_while_training`。
+
+另外，需特别注意运行 :code:`fluid.default_startup_program()` 必须在调用 :code:`fluid.io.load_params`
+之前。如果在之后运行，可能会覆盖已加载的模型参数导致错误。
+
+
+载入checkpoint用于恢复训练
+==========================
+
+对于通过 :code:`fluid.io.save_checkpoint` 保存的模型，可以使用 :code:`fluid.io.load_checkpoint`
+来进行载入。
+
+例如：
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./checkpoints"
+    prog = fluid.default_main_program()
+    fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
+                             serial=9, main_program=prog)
+
+上面的例子中，通过调用 :code:`fluid.io.save_checkpoint` 函数，PaddlePaddle Fluid会对
+:code:`prog` 中的所有模型变量进行扫描，根据内置规则自动筛选出需要加载的变量，
+并尝试从 :code:`path` 之中加载它们。
+
+参数 :code:`serial` 用来标记具体要加载的checkpoint的版本号。在保存checkpoint的时候，
+一个checkpoint会被保存在一个子目录中，并在目录名上体现出自己的版本号。
+一般越大的版本号表示这个checkpoint越新。
+
+这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_checkpoint` 时所用的 :code:`prog`
+完全一致，否则会导致变量加载错误或者未加载。另外，与 :code:`fluid.io.save_params` 类似，
+运行 :code:`fluid.default_startup_program()` 也必须在 :code:`fluid.io.load_checkpoint`
+之前进行。
+
+多机checkpoint保存
+##################
+
+.. toctree::
+   :maxdepth: 2
+
+   checkpoint_doc_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/user_guides/howto/training/single_node.rst b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst
new file mode 100644
index 0000000000000000000000000000000000000000..23eac0f831f2d6d052b7fc35b536d4ab633df851
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst
@@ -0,0 +1,119 @@
+########
+单机训练
+########
+
+准备工作
+########
+
+要进行PaddlePaddle Fluid单机训练，需要先 :ref:`user_guide_prepare_data` 和
+:ref:`user_guide_configure_simple_model` 。当\
+:ref:`user_guide_configure_simple_model` 完毕后，可以得到两个\
+:code:`fluid.Program`， :code:`startup_program` 和 :code:`main_program`。
+默认情况下，可以使用 :code:`fluid.default_startup_program()` 与\ :code:`fluid.default_main_program()` 获得全局的 :code:`fluid.Program`。
+
+例如:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   image = fluid.layers.data(name="image", shape=[784])
+   label = fluid.layers.data(name="label", shape=[1])
+   hidden = fluid.layers.fc(input=image, size=100, act='relu')
+   prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+   loss = fluid.layers.mean(
+       fluid.layers.cross_entropy(
+           input=prediction,
+           label=label
+       )
+   )
+
+   sgd = fluid.optimizer.SGD(learning_rate=0.001)
+   sgd.minimize(loss)
+
+   # Here the fluid.default_startup_program() and fluid.default_main_program()
+   # has been constructed.
+
+在上述模型配置执行完毕后， :code:`fluid.default_startup_program()` 与\
+:code:`fluid.default_main_program()` 配置完毕了。
+
+初始化参数
+##########
+
+参数随机初始化
+==============
+
+用户配置完模型后，参数初始化操作会被写入到\
+:code:`fluid.default_startup_program()` 中。使用 :code:`fluid.Executor()` 运行
+这一程序，即可在全局 :code:`fluid.global_scope()` 中随机初始化参数。例如:
+
+.. code-block:: python
+
+   exe = fluid.Executor(fluid.CUDAPlace(0))
+   exe.run(program=fluid.default_startup_program())
+
+值得注意的是: 如果使用多GPU训练，参数需要先在GPU0上初始化，再经由\
+:code:`fluid.ParallelExecutor` 分发到多张显卡上。
+
+
+载入预定义参数
+==============
+
+在神经网络训练过程中，经常会需要载入预定义模型，进而继续进行训练。\
+如何载入预定义参数，请参考 :ref:`user_guide_save_load_vars`。
+
+
+单卡训练
+########
+
+执行单卡训练可以使用 :code:`fluid.Executor()` 中的 :code:`run()` 方法，运行训练\
+:code:`fluid.Program` 即可。在运行的时候，用户可以通过 :code:`run(feed=...)`\
+参数传入数据；用户可以通过 :code:`run(fetch=...)` 获取持久的数据。例如:\
+
+.. code-block:: python
+
+   ...
+   loss = fluid.layers.mean(...)
+
+   exe = fluid.Executor(...)
+   # the result is an numpy array
+   result = exe.run(feed={"image": ..., "label": ...}, fetch_list=[loss])
+
+这里有几点注意事项:
+
+1. feed的数据格式，请参考文章 :ref:`user_guide_feed_data_to_executor`。
+2. :code:`Executor.run` 的返回值是 :code:`fetch_list=[...]` 的variable值。被fetch\
+   的Variable必须是persistable的。 :code:`fetch_list` 可以传入Variable的列表，\
+   也可以传入Variable的名字列表。:code:`Executor.run` 返回Fetch结果列表。
+3. 如果需要取回的数据包含序列信息，可以设置
+   :code:`exe.run(return_numpy=False, ...)` 直接返回 :code:`fluid.LoDTensor`
+   。用户可以直接访问 :code:`fluid.LoDTensor` 中的信息。
+
+多卡训练
+########
+
+执行多卡训练可以使用 :code:`fluid.ParallelExecutor` 运行训练
+:code:`fluid.Program`。例如:
+
+.. code-block:: python
+
+   train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name,
+                                main_program=fluid.default_main_program())
+   train_exe.run(fetch_list=[loss.name], feed={...})
+
+这里有几点注意事项:
+
+1. :code:`ParallelExecutor` 的构造函数需要指明要执行的 :code:`fluid.Program` ,
+   并在执行过程中不能修改。默认值是 :code:`fluid.default_main_program()` 。
+2. :code:`ParallelExecutor` 需要明确指定是否使用 CUDA 显卡进行训练。在显卡训练\
+   模式下会占用全部显卡。用户可以配置 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 来修改占用\
+   的显卡。
+
+进阶使用
+########
+
+.. toctree::
+   :maxdepth: 2
+
+   test_while_training
+   save_load_variables
diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..16f6b8835c4ffb82babca56b62ba44494fd6a947
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle differ
diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png
new file mode 100644
index 0000000000000000000000000000000000000000..587a1a48affdde6809d7f8bf77e1055db7cd8c14
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png differ
diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..046c4903231e8ca441884674c08b381766c0bbae
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle differ
diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..cd2f92ad1a14ac12efc2c257c8aa3d1ae403b2b1
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png differ
diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c078b5241559a05219447db67b5d8a35aeefd3f
Binary files /dev/null and b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png differ
diff --git a/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst
new file mode 100644
index 0000000000000000000000000000000000000000..37d5c0d78179ccead7a81dffb4ae2f0d835a5949
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst
@@ -0,0 +1,120 @@
+.. _user_guide_test_while_training:
+
+##################
+训练过程中评测模型
+##################
+
+模型的测试评价与训练的 :code:`fluid.Program` 不同。在测试评价中:
+
+1. 评价测试不进行反向传播，不优化更新参数。
+2. 评价测试执行的操作可以不同。
+
+   * 例如 BatchNorm 操作，在训练和测试时执行不同的算法。
+
+   * 评价模型与训练相比可以是完全不同的模型。
+
+生成测试 :code:`fluid.Program`
+#################################
+
+通过克隆训练 :code:`fluid.Program` 生成测试 :code:`fluid.Program`
+=======================================================================
+
+:code:`Program.clone()` 方法可以复制出新的 :code:`fluid.Program` 。 通过设置
+:code:`Program.clone(for_test=True)` 复制含有用于测试的操作Program。简单的使用方法如下:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   img = fluid.layers.data(name="image", shape=[784])
+   prediction = fluid.layers.fc(
+     input=fluid.layers.fc(input=img, size=100, act='relu'),
+     size=10,
+     act='softmax'
+   )
+   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+   loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
+   acc = fluid.layers.accuracy(input=prediction, label=label)
+
+   test_program = fluid.default_main_program().clone(for_test=True)
+
+   adam = fluid.optimizer.Adam(learning_rate=0.001)
+   adam.minimize(loss)
+
+在使用 :code:`Optimizer` 之前，将 :code:`fluid.default_main_program()` 复制\
+成一个 :code:`test_program` 。之后使用测试数据运行 :code:`test_program`,\
+就可以做到运行测试程序，而不影响训练结果。
+
+分别配置训练 :code:`fluid.Program` 和测试 :code:`fluid.Program`
+=====================================================================
+
+如果训练程序和测试程序相差较大时，用户也可以通过完全定义两个不同的
+:code:`fluid.Program`，分别进行训练和测试。在PaddlePaddle Fluid中，\
+所有的参数都有名字。如果两个不同的操作，甚至两个不同的网络使用了同样名字的参数，\
+那么他们的值和内存空间都是共享的。
+
+PaddlePaddle Fluid中使用 :code:`fluid.unique_name` 包来随机初始化用户未定义的\
+参数名称。通过 :code:`fluid.unique_name.guard` 可以确保多次调用某函数\
+参数初始化的名称一致。
+
+例如:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   def network(is_test):
+       file_obj = fluid.layers.open_files(filenames=["test.recordio"] if is_test else ["train.recordio"], ...)
+       img, label = fluid.layers.read_file(file_obj)
+       hidden = fluid.layers.fc(input=img, size=100, act="relu")
+       hidden = fluid.layers.batch_norm(input=hidden, is_test=is_test)
+       ...
+       return loss
+
+   with fluid.unique_name.guard():
+       train_loss = network(is_test=False)
+       sgd = fluid.optimizer.SGD(0.001)
+       sgd.minimize(train_loss)
+
+   test_program = fluid.Program()
+   with fluid.unique_name.guard():
+       with fluid.program_gurad(test_program, fluid.Program()):
+           test_loss = network(is_test=True)
+
+   # fluid.default_main_program() is the train program
+   # fluid.test_program is the test program
+
+执行测试 :code:`fluid.Program`
+#################################
+
+使用 :code:`Executor` 执行测试 :code:`fluid.Program`
+=======================================================
+
+用户可以使用 :code:`Executor.run(program=...)` 来执行测试
+:code:`fluid.Program`。
+
+例如
+
+.. code-block:: python
+
+   exe = fluid.Executor(fluid.CPUPlace())
+   test_acc = exe.run(program=test_program, feed=test_data_batch, fetch_list=[acc])
+   print 'Test accuracy is ', test_acc
+
+使用 :code:`ParallelExecutor` 执行测试 :code:`fluid.Program`
+===============================================================
+
+用户可以使用训练用的 :code:`ParallelExecutor` 与测试 :code:`fluid.Program`
+一起新建一个测试的 :code:`ParallelExecutor` ；再使用测试
+:code:`ParallelExecutor.run` 来执行测试。
+
+例如:
+
+.. code-block:: python
+
+   train_exec = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+
+   test_exec = fluid.ParallelExecutor(use_cuda=True, share_vars_from=train_exec,
+                                      main_program=test_program)
+   test_acc = test_exec.run(fetch_list=[acc], ...)
+
diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..453cb71cfdf72e031ce0f0517e2db936eca38dfc
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/index.rst
@@ -0,0 +1,18 @@
+########
+使用指南
+########
+
+
+..  todo::
+
+    完善导引介绍
+
+..  toctree::
+    :maxdepth: 2
+
+    howto/prepare_data/index
+    howto/configure_simple_model/index
+    howto/training/index
+    howto/debug/index
+    howto/evaluation/index
+    models/index.rst
diff --git a/doc/fluid/new_docs/user_guides/models/index.rst b/doc/fluid/new_docs/user_guides/models/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..998e95c4885dc313d9449f5466f80c53d34fe82a
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/models/index.rst
@@ -0,0 +1,137 @@
+Fluid 模型库
+============
+
+图像分类
+--------
+
+图像分类是根据图像的语义信息对不同类别图像进行区分，是计算机视觉中重要的基础问题，是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础，在许多领域都有着广泛的应用。如：安防领域的人脸识别和智能视频分析等，交通领域的交通场景识别，互联网领域基于内容的图像检索和相册自动归类，医学领域的图像识别等。
+
+在深度学习时代，图像分类的准确率大幅度提升，在图像分类任务中，我们向大家介绍了如何在经典的数据集ImageNet上，训练常用的模型，包括AlexNet、VGG、GoogLeNet、ResNet、Inception-v4、MobileNet、DPN(Dual
+Path
+Network)、SE-ResNeXt模型，也开源了\ `训练的模型 <https://github.com/PaddlePaddle/models/blob/develop/fluid/image_classification/README_cn.md#已有模型及其性能>`__\ 方便用户下载使用。同时提供了能够将Caffe模型转换为PaddlePaddle
+Fluid模型配置和参数文件的工具。
+
+-  `AlexNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `VGG <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `GoogleNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `Residual
+   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `Inception-v4 <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `MobileNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `Dual Path
+   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `SE-ResNeXt <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
+-  `Caffe模型转换为Paddle
+   Fluid配置和模型文件工具 <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid>`__
+
+目标检测
+--------
+
+目标检测任务的目标是给定一张图像或是一个视频帧，让计算机找出其中所有目标的位置，并给出每个目标的具体类别。对于人类来说，目标检测是一个非常简单的任务。然而，计算机能够“看到”的是图像被编码之后的数字，很难解图像或是视频帧中出现了人或是物体这样的高层语义概念，也就更加难以定位目标出现在图像中哪个区域。与此同时，由于目标会出现在图像或是视频帧中的任何位置，目标的形态千变万化，图像或是视频帧的背景千差万别，诸多因素都使得目标检测对计算机来说是一个具有挑战性的问题。
+
+在目标检测任务中，我们介绍了如何基于\ `PASCAL
+VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`__\ 、\ `MS
+COCO <http://cocodataset.org/#home>`__\ 数据训练通用物体检测模型，当前介绍了SSD算法，SSD全称Single Shot MultiBox Detector，是目标检测领域较新且效果较好的检测算法之一，具有检测速度快且检测精度高的特点。
+
+开放环境中的检测人脸，尤其是小的、模糊的和部分遮挡的人脸也是一个具有挑战的任务。我们也介绍了如何基于 `WIDER FACE <http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/>`_ 数据训练百度自研的人脸检测PyramidBox模型，该算法于2018年3月份在WIDER FACE的多项评测中均获得 `第一名 <http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/WiderFace_Results.html>`_。
+
+-  `Single Shot MultiBox
+   Detector <https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md>`__
+-  `Face Detector: PyramidBox <https://github.com/PaddlePaddle/models/tree/develop/fluid/face_detection/README_cn.md>`_
+
+图像语义分割
+------------
+
+图像语意分割顾名思义是将图像像素按照表达的语义含义的不同进行分组/分割，图像语义是指对图像内容的理解，例如，能够描绘出什么物体在哪里做了什么事情等，分割是指对图片中的每个像素点进行标注，标注属于哪一类别。近年来用在无人车驾驶技术中分割街景来避让行人和车辆、医疗影像分析中辅助诊断等。
+
+在图像语义分割任务中，我们介绍如何基于图像级联网络(Image Cascade
+Network,ICNet)进行语义分割，相比其他分割算法，ICNet兼顾了准确率和速度。
+
+-  `ICNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/icnet>`__
+
+场景文字识别
+------------
+
+许多场景图像中包含着丰富的文本信息，对理解图像信息有着重要作用，能够极大地帮助人们认知和理解场景图像的内容。场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下，将图像信息转化为文字序列的过程，可认为是一种特别的翻译过程：将图像输入翻译为自然语言输出。场景图像文字识别技术的发展也促进了一些新型应用的产生，如通过自动识别路牌中的文字帮助街景应用获取更加准确的地址信息等。
+
+在场景文字识别任务中，我们介绍如何将基于CNN的图像特征提取和基于RNN的序列翻译技术结合，免除人工定义特征，避免字符分割，使用自动学习到的图像特征，完成端到端地无约束字符定位和识别。当前，介绍了CRNN-CTC模型，后续会引入基于注意力机制的序列到序列模型。
+
+-  `CRNN-CTC模型 <https://github.com/PaddlePaddle/models/tree/develop/fluid/ocr_recognition>`__
+
+语音识别
+--------
+
+自动语音识别（Automatic Speech Recognition,
+ASR）是将人类声音中的词汇内容转录成计算机可输入的文字的技术。语音识别的相关研究经历了漫长的探索过程，在HMM/GMM模型之后其发展一直较为缓慢，随着深度学习的兴起，其迎来了春天。在多种语言识别任务中，将深度神经网络(DNN)作为声学模型，取得了比GMM更好的性能，使得
+ASR
+成为深度学习应用最为成功的领域之一。而由于识别准确率的不断提高，有越来越多的语言技术产品得以落地，例如语言输入法、以智能音箱为代表的智能家居设备等
+—— 基于语言的交互方式正在深刻的改变人类的生活。
+
+与 `DeepSpeech <https://github.com/PaddlePaddle/DeepSpeech>`__
+中深度学习模型端到端直接预测字词的分布不同，本实例更接近传统的语言识别流程，以音素为建模单元，关注语言识别中声学模型的训练，利用\ `kaldi <http://www.kaldi-asr.org>`__\ 进行音频数据的特征提取和标签对齐，并集成
+kaldi 的解码器完成解码。
+
+-  `DeepASR <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepASR/README_cn.md>`__
+
+机器翻译
+--------
+
+机器翻译（Machine
+Translation）将一种自然语言(源语言)转换成一种自然语言（目标语音），是自然语言处理中非常基础和重要的研究方向。在全球化的浪潮中，机器翻译在促进跨语言文明的交流中所起的重要作用是不言而喻的。其发展经历了统计机器翻译和基于神经网络的神经机器翻译(Nueural
+Machine Translation, NMT)等阶段。在 NMT
+成熟后，机器翻译才真正得以大规模应用。而早阶段的 NMT
+主要是基于循环神经网络 RNN
+的，其训练过程中当前时间步依赖于前一个时间步的计算，时间步之间难以并行化以提高训练速度。因此，非
+RNN 结构的 NMT 得以应运而生，例如基于卷积神经网络 CNN
+的结构和基于自注意力机制（Self-Attention）的结构。
+
+本实例所实现的 Transformer
+就是一个基于自注意力机制的机器翻译模型，其中不再有RNN或CNN结构，而是完全利用
+Attention 学习语言中的上下文依赖。相较于RNN/CNN,
+这种结构在单层内计算复杂度更低、易于并行化、对长程依赖更易建模，最终在多种语言之间取得了最好的翻译效果。
+
+-  `Transformer <https://github.com/PaddlePaddle/models/blob/develop/fluid/neural_machine_translation/transformer/README_cn.md>`__
+
+强化学习
+--------
+
+强化学习是近年来一个愈发重要的机器学习方向，特别是与深度学习相结合而形成的深度强化学习(Deep
+Reinforcement Learning,
+DRL)，取得了很多令人惊异的成就。人们所熟知的战胜人类顶级围棋职业选手的
+AlphaGo 就是 DRL
+应用的一个典型例子，除游戏领域外，其它的应用还包括机器人、自然语言处理等。
+
+深度强化学习的开山之作是在Atari视频游戏中的成功应用，
+其可直接接受视频帧这种高维输入并根据图像内容端到端地预测下一步的动作，所用到的模型被称为深度Q网络(Deep
+Q-Network, DQN)。本实例就是利用PaddlePaddle Fluid这个灵活的框架，实现了
+DQN 及其变体，并测试了它们在 Atari 游戏中的表现。
+
+-  `DeepQNetwork <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepQNetwork/README_cn.md>`__
+
+中文词法分析
+------------
+
+中文分词(Word Segmentation)是将连续的自然语言文本，切分出具有语义合理性和完整性的词汇序列的过程。因为在汉语中，词是承担语义的最基本单位，切词是文本分类、情感分析、信息检索等众多自然语言处理任务的基础。 词性标注（Part-of-speech Tagging）是为自然语言文本中的每一个词汇赋予一个词性的过程，这里的词性包括名词、动词、形容词、副词等等。 命名实体识别（Named Entity Recognition，NER）又称作“专名识别”，是指识别自然语言文本中具有特定意义的实体，主要包括人名、地名、机构名、专有名词等。 我们将这三个任务统一成一个联合任务，称为词法分析任务，基于深度神经网络，利用海量标注语料进行训练，提供了一个端到端的解决方案。
+
+我们把这个联合的中文词法分析解决方案命名为LAC。LAC既可以认为是Lexical Analysis of Chinese的首字母缩写，也可以认为是LAC Analyzes Chinese的递归缩写。
+
+- `LAC <https://github.com/baidu/lac/blob/master/README.md>`__
+
+情感倾向分析
+------------
+
+情感倾向分析针对带有主观描述的中文文本，可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极、 中性。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控，为企业提供有力的决策支持。本次我们开放 AI开放平台中情感倾向分析采用的模型(http://ai.baidu.com/tech/nlp/sentiment_classify )， 提供给用户使用。
+
+- `Senta <https://github.com/baidu/Senta/blob/master/README.md>`__
+
+AnyQ
+----
+
+`AnyQ <https://github.com/baidu/AnyQ>`__\ (ANswer Your Questions)
+开源项目主要包含面向FAQ集合的问答系统框架、文本语义匹配工具SimNet。
+问答系统框架采用了配置化、插件化的设计，各功能均通过插件形式加入，当前共开放了20+种插件。开发者可以使用AnyQ系统快速构建和定制适用于特定业务场景的FAQ问答系统，并加速迭代和升级。
+
+SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架，该框架在百度各产品上广泛应用，主要包括BOW、CNN、RNN、MM-DNN等核心网络结构形式，同时基于该框架也集成了学术界主流的语义匹配模型，如MatchPyramid、MV-LSTM、K-NRM等模型。使用SimNet构建出的模型可以便捷的加入AnyQ系统中，增强AnyQ系统的语义匹配能力。
+
+-  `SimNet in PaddlePaddle
+   Fluid <https://github.com/baidu/AnyQ/blob/master/tools/simnet/train/paddle/README.md>`__
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion  
+Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.   
+
+There are generally two ways to fuse operators, fusing directly connected operators and fusing non directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of fusing operator is according to some rules to combine multiple operations into one, for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.   
+
+## Challenge
+The challenge of fusing operators is:
+  - how to make the rules.
+  - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analysis the operators of the DL model, we found there are two group of operators can be fused explicitly, one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`, and the other is the operators that have the same function, for example, a serials of `SGD` or `Momentum`. They usually appear in the model in a large number. So we should think about how to fuse them separately first.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
+
+#### How to fuse the operators that have the same function efficiently?
+We take SGD operator as an example, the training model may have hundreds of parameters and correspondingly have the same number of SGD operators. The expression(`w = w - lr*w_g`) of those operators is the same, so during of training, the executor will execute this expression hundreds time in CPU or other specialized accelerators. If we can fuse them and make the address of all `w` and all `w_g` continuous respectively, we only need execute one time. For some accelerators, the time of launching kernel is not neglected, so the time of hundreds of times of launching and executing kernel may be larger than launching and executing only once. There usually are many operators that similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst
index 70c5c524aaf0a9ae003bf4340c3f268c225d4419..5813509dce46677444f0234db8e0eaa4f113e3a0 100644
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -4,7 +4,6 @@ API
 ..  toctree::
     :maxdepth: 1
 
-    overview.rst
     model_configs.rst
     data.rst
     run_logic.rst
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 6421c5308271c2508597d849c79709255caf349a..d0dacb104f148c2aeb323365cbd6f014ae00ed5a 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -35,11 +35,16 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
    # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
    docker build -t paddle:dev .
    # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
    docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+注：
+
+- 上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`.
 
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index b08b45d43ec7f1deb2889832079a731ee724a44c..664b68da8b7dd3e005ebf3ec34de77729e5ab355 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -36,13 +36,18 @@ If you don't wish to use docker，you need to install several compile dependenci
    # 2. Optional: build development docker image from source
    docker build -t paddle:dev .
    # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
    docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
-NOTE: The above command try to mount the current working directory (root directory of source code)
+NOTE: 
+
+- The above command try to mount the current working directory (root directory of source code)
 into :code:`/paddle` directory inside docker container.
 
+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
+
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
 machine or copy it to the target machine.
diff --git a/doc/v2/faq/parameter/index_en.rst b/doc/v2/faq/parameter/index_en.rst
index 61c7845af7e531013a06125f7c35b59081dafb42..9edb8dd620f972d019db9c0063cefce616de0ebd 100644
--- a/doc/v2/faq/parameter/index_en.rst
+++ b/doc/v2/faq/parameter/index_en.rst
@@ -1,5 +1,198 @@
-#################
-Parameter Setting
-#################
+##################
+Parameter Settings
+##################
 
-TBD
+.. contents::
+
+1. How to Choose the Learning Rate of SGD Algorithm
+--------------------------
+
+An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
+
+Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
+
+If the cost of the training process is significantly higher than the cost of the output, then we judge that the training does not converge. For example, if we have a three-class problem and use multi-class-cross-entropy as the cost, the ratio of 0, 1, and 2 in the data will be :code:`0.2, 0.5, 0.3`. The minimum cost thus will be :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is greater than this number after training a pass (or even before), then the training may not be converged and the learning rate should be reduced.
+
+2. How to Implement Learning Rate Annealing
+------------------------------------------------
+
+We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
+
+.. code-block:: python
+
+    Optimizer = paddle.optimizer.Adam(
+        Learning_rate=1e-3,
+        Learning_rate_decay_a=0.5,
+        Learning_rate_decay_b=0.75,
+        Learning_rate_schedule="poly",)
+
+PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
+
+* "constant"
+  
+  Lr = learning_rate
+
+* "poly"
+
+  Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  Variable :code:`num_samples_processed` is the number of trained samples.
+
+* "caffe_poly"
+
+  Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      Optimizer = paddle.optimizer.Adam(
+          Learning_rate=1e-3,
+          Learning_rate_schedule="manual",
+          Learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is: code:`1e-3*1.0`; when the number of trained samples is greater than 1000 or less than or equal to 2000, the learning rate is:code:`1e- 3 * 0.9`; when the number of trained samples is greater than 2,000, the learning rate is: code:`1e-3*0.8`.
+
+* "pass_manual"
+
+  This is a learning rate annealing method that piecewisely pick values according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      Optimizer = paddle.optimizer.Adam(
+          Learning_rate=1e-3,
+          Learning_rate_schedule="pass_manual",
+          Learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 or less than 2, the learning rate is :code:`1e- 3 * 0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`.
+
+3. How to Initialize Parameters
+-----------------
+
+By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
+
+* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, to set a full connection layer parameter initialization mode and bias initialization mode, you can use the following code:
+
+.. code-block:: python
+
+    Hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      Bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The above code initializes the bias to 1.0 and initializes the parameters to a uniform distribution of :code:`[1.0, -1.0]`.
+
+4. How to Share Parameters
+---------------
+
+PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name will share parameters//. We can set the name of the parameters using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object.
+
+A simple fully connected network has its configuration of parameter sharing as follows \:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two input of the softmax layer also use the same parameter :code:`softmax_param`.
+
+5. How to Load Pre-training Parameters
+------------------------
+* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    Emb_para = paddle.attr.Param(name='emb', is_static=True)
+    Paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle is the header information. The user must loads : :code:`numpy.array` starting with the 17th byte. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    Def load_parameter(file_name, h, w):
+        With open(file_name, 'rb') as f:
+            F.read(16) # skip header.
+            Return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    Parameters = paddle.parameters.create(my_cost)
+    Parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. Format of the Stored Parameter and How to Convert the File to Plain Text
+--------------------------------------------------
+
+The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The third group of four bytes represents the total number of saved parameters.
+
+When restoring the model parameters saved by PaddlePaddle back to plain text, we use the corresponding data type :code:`numpy.array` to load specific network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If not specified to compile with a precision for double in PaddlePaddle, then the parameter file will be caiculated with a precision for float, and the argument will be stored as a float. In this case, when using :code:`numpy.array`, generally we set :code:`dtype=float32`. An example is as follows:
+
+.. code-block:: python
+
+    Def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        Vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        Np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                Fmt="%.6f", delimiter=",")
+
+
+When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    Def gen_rand_param(param_file, width, height, need_trans):
+        Np.random.seed()
+        Header = struct.pack("iil", 0, 4, height * width)
+        Param = np.float32(np.random.rand(height, width))
+        With open(param_file, "w") as fparam:
+            Fparam.write(header + param.tostring())
+
+7. A Protocol Message Rejected Because of its Large Size
+-------------------------------------------------- ----------
+
+If you are training NLP related models, and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. A wrongly defineed `_py_data_sources2` is similar to:
+
+.. code-block:: python
+
+     Src_dict = dict()
+     For line_count, line in enumerate(open(src_dict_path, "r")):
+        Src_dict[line.strip()] = line_count
+
+     Define_py_data_sources2(
+        Train_list,
+        Test_list,
+        Module="dataprovider",
+        Obj="process",
+        Args={"src_dict": src_dict})
+
+The solution is to pass the address of the dictionary as args to the dataprovider, and then load the dictionary according to the address in the dataprovider. Change `_py_data_sources2` to:
+
+.. code-block:: python
+
+     Define_py_data_sources2(
+        Train_list,
+        Test_list,
+        Module="dataprovider",
+        Obj="process",
+        Args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md
index 2c87e9afc6911526cd51d6c691f262960accc9e8..8878ee9d85064ba27708ed92790aa9b83ba316e5 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
@@ -22,23 +22,23 @@
 </tr>
 <tr>
 <td>cpu_noavx_openblas</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda7.5_cudnn5_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda8.0_cudnn5_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda8.0_cudnn7_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda9.0_cudnn7_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 </tbody></table>
 
diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md
index 3fa8a18a9fbea21b494c416e6b938990fbb68337..70a6edef27e75af6b38d7d4824c928eba0d29b9a 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_en.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@@ -13,31 +13,31 @@
 <tbody>
 <tr>
 <td>cpu_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cpu_avx_openblas</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cpu_noavx_openblas</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda7.5_cudnn5_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda8.0_cudnn5_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda8.0_cudnn7_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cuda9.0_cudnn7_avx_mkl</td>
-<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
 </tr>
 </tbody></table>
 
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
index 3acdbae28e9b35f8a9104a89c9a5799f8c892334..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d 100644
--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -28,9 +28,9 @@
 
 ### 准备预测模型
 
-准备预测模型部分，我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression)，网络接受一幅图片作为输入，将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。
+准备预测模型部分，我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression)，网络接受一幅图片作为输入，将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。
 
-调用C-API开发预测程序需要一个训练好的模型，运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本，在终端执行`python mnist_v2.py`，会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
+调用C-API开发预测程序需要一个训练好的模型，运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本，在终端执行`python mnist_v2.py`，会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
 
 下面，我们将训练结束后存储下来的模型转换成预测模型。
 
@@ -48,7 +48,7 @@
     dump_v2_config(predict, "trainer_config.bin", True)
     ```
 
-    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例，[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程，可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化，结果会写入当前运行目录下的`trainer_config.bin`文件中。
+    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例，[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程，可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化，结果会写入当前运行目录下的`trainer_config.bin`文件中。
 
     使用这种方式，需要**在运行时将神经网络的多个可学习参数放在同一个目录中**，C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。
 
@@ -68,7 +68,7 @@
     merge_v2_model(net, param_file, output_file)
     ```
 
-    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例，可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式，运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
+    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例，可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式，运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
 
 #### 注意事项
 1. 为使用C-API，在调用`dump_v2_config`序列化神经网络结构时，参数`binary`必须指定为`True`。
@@ -77,10 +77,10 @@
 
 ### 编写预测代码
 
-预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
+预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
 
 #### step 1. 初始化PaddlePaddle运行环境
-第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境，该接口接受两个参数：参数的个数和参数列表。
+第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境，该接口接受两个参数：参数的个数和参数列表。
 
 #### step2. 加载模型
 
@@ -88,8 +88,8 @@
 
 概念上，在 PaddlePaddle 内部，一个GradientMachine类的对象管理着一组计算层（PaddlePaddle Layers）来完成前向和反向计算，并处理与之相关的所有细节。在调用C-API预测时，只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型，下面是C-API提供的，两种常用的模型加载方式：
 
-1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口，从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型；
-1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口，与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
+1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口，从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型；
+1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口，与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。
 
 - 注意事项
 
@@ -117,7 +117,7 @@ C-API支持的所有输入数据类型和他们的组织方式，请参考“输
 
 #### step 4. 前向计算
 
-完成上述准备之后，通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
+完成上述准备之后，通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
 
 #### step 5. 清理
 
diff --git a/doc/v2/howto/rnn/hierarchical_layer_en.rst b/doc/v2/howto/rnn/hierarchical_layer_en.rst
index 236f58a160c7f77c28e4b1216b83b3d3cdaaa459..fb668f1babb47f49b2dab6d2411565e99599d8b0 100644
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
@@ -1,4 +1,89 @@
-Layers supporting hierarchical sequence as input
-================================================
-
-TBD
+###########################
+Layers that Support Hierarchical Sequences as Input
+###########################
+ 
+.. contents::
+ 
+Overview 
+====
+ 
+A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
+ 
+A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
+ 
+We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
+ 
++ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
++ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information;
++ Double-level sequence: multiple elements arranged in a row; each element is a single-layer sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence.
+ 
+In PaddlePaddle, the following layers accept double-layer sequences as input and perform corresponding calculations.
+ 
+`pooling`
+========
+ 
+The use of pooling is as follows:
+ 
+.. code-block:: bash
+ 
+        Seq_pool = pooling(input=layer,
+                           Pooling_type=pooling.Max(),
+                           Agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
+ 
+- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
+ 
+- When ʻagg_level=AggregateLevel.TO_SEQUENCE`:
+ 
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
+ 
+`last_seq` and `first_seq`
+=====================
+ 
+An example of using `last_seq` is as follows (usage of `first_seq` is similar).
+ 
+.. code-block:: bash
+ 
+        Last = last_seq(input=layer,
+                        Agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
+ 
+- When ʻagg_level=AggregateLevel.TO_SEQUENCE`:
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-layer sequence in which each element is the last (or first) element of each subseq in a double-level sequence.
+ 
+`expand`
+======
+ 
+The use of expand is as follows.
+ 
+.. code-block:: bash
+ 
+        Ex = expand(input=layer1,
+                    Expand_as=layer2,
+                    Expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+        
+- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
+ 
+  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
+  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
+  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
+ 
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+ 
+  - Effect: a single-level sequence is extended to a double-level sequence
+  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
+  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py
index 91ba101edb65cd45bd5e37a0c6ad25e515593a81..66e0345c299730c113ffbdc8dd3c1fa32f872f3d 100644
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -118,7 +118,7 @@ class Float16Transpiler:
 
         for var in self.block.vars.keys():
             if var not in args:
-                self.block.remove_var(var)
+                self.block._remove_var(var)
 
     def _modify_feed_fetch(self):
         '''
@@ -165,7 +165,7 @@ class Float16Transpiler:
                     dtype=core.VarDesc.VarType.FP16,
                     shape=var.shape,
                     persistable=var.persistable)
-                self.block.insert_op(
+                self.block._insert_op(
                     i + 1,
                     type="cast",
                     inputs={"X": var},
@@ -188,7 +188,7 @@ class Float16Transpiler:
                     persistable=var.persistable)
                 find_op(var)
                 var.op.rename_output(var_name, tmp_var_name)
-                self.block.insert_op(
+                self.block._insert_op(
                     i,
                     type="cast",
                     inputs={"X": tmp_var},
@@ -253,4 +253,4 @@ class Float16Transpiler:
 
             # old var will be replaced by the fp16 var in program desc
             self.input_map[var.name] = fp16_var_name
-            self.block.remove_var(var.name)
+            self.block._remove_var(var.name)
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
deleted file mode 100644
index ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36..0000000000000000000000000000000000000000
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-inference_api_test(simple_on_word2vec ARGS test_word2vec)
-
-option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
-if(NOT WITH_INFERENCE_DEMO)
-  return()
-endif()
-
-set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
-set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
-
-function(inference_download_test_demo TARGET)
-    if (NOT WITH_TESTING)
-        return()
-    endif()
-    set(options "")
-    set(oneValueArgs URL)
-    set(multiValueArgs SRCS)
-    cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
-    message(STATUS "inference demo ${test_dir}")
-
-    if(NOT EXISTS "${test_dir}")
-        message(STATUS "Download ${TARGET} model from ${tests_URL}")
-        execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
-        execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
-        execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
-    endif()
-
-    cc_test(${TARGET} SRCS "${tests_SRCS}"
-        DEPS paddle_inference_api paddle_fluid
-        ARGS --data=${test_dir}/data.txt
-             --modeldir=${test_dir}/model
-             --refer=${test_dir}/result.txt)
-endfunction()
-
-# disable mobilenet test
-#inference_download_test_demo(mobilenet_inference_demo
-#    SRCS vis_demo.cc
-#    URL ${URL_ROOT}mobilenet.tar.gz)
-inference_download_test_demo(se_resnext50_inference_demo
-    SRCS vis_demo.cc
-    URL ${URL_ROOT}se_resnext50.tar.gz)
-inference_download_test_demo(ocr_inference_demo
-    SRCS vis_demo.cc
-    URL ${URL_ROOT}ocr.tar.gz)
diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md
deleted file mode 100644
index f1d256660299a68dc5d9d73dbe4a401a0e7d9680..0000000000000000000000000000000000000000
--- a/paddle/contrib/inference/demo/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Infernce Demos
-
-Input data format:
-
-- Each line contains a single record
-- Each record's format is
-
-```
-<space splitted floats as data>\t<space splitted ints as shape>
-```
-
-Follow the C++ codes in `vis_demo.cc`.
-
-## MobileNet
-
-To execute the demo, simply run
-
-```sh
-./mobilenet_inference_demo --modeldir <model> --data <datafile>
-```
-
-## SE-ResNeXt-50
-
-To execute the demo, simply run
-
-```sh
-./se_resnext50_inference_demo --modeldir <model> --data <datafile>
-```
-
-## OCR
-
-To execute the demo, simply run
-
-```sh
-./ocr_inference_demo --modeldir <model> --data <datafile>
-```
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
deleted file mode 100644
index ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f..0000000000000000000000000000000000000000
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
-#include <cuda.h>
-
-namespace paddle {
-
-PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
-    const AnakinConfig &config) {
-  CHECK(Init(config));
-}
-
-bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
-  if (!(graph_.load(config.model_file))) {
-    return false;
-  }
-  graph_.ResetBatchSize("input_0", config.max_batch_size);
-  // optimization for graph
-  if (!(graph_.Optimize())) {
-    return false;
-  }
-  // construct executer
-  executor_.init(graph_);
-  return true;
-}
-
-bool PaddleInferenceAnakinPredictor::Run(
-    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
-  for (const auto &input : inputs) {
-    if (input.dtype != PaddleDType::FLOAT32) {
-      LOG(ERROR) << "Only support float type inputs. " << input.name
-                 << "'s type is not float";
-      return false;
-    }
-    auto d_tensor_in_p = executor_.get_in(input.name);
-    float *d_data_p = d_tensor_in_p->mutable_data();
-    if (cudaMemcpy(d_data_p,
-                   static_cast<float *>(input.data.data()),
-                   d_tensor_in_p->valid_size() * sizeof(float),
-                   cudaMemcpyHostToDevice) != 0) {
-      LOG(ERROR) << "copy data from CPU to GPU error";
-      return false;
-    }
-  }
-
-  executor_.prediction();
-
-  if (output_data->empty()) {
-    LOG(ERROR) << "At least one output should be set with tensors' names.";
-    return false;
-  }
-  for (auto &output : *output_data) {
-    auto *tensor = executor_.get_out(output.name);
-    output.shape = tensor->shape();
-    if (output.data.length() < tensor->valid_size() * sizeof(float)) {
-      output.data.Resize(tensor->valid_size() * sizeof(float));
-    }
-    // Copy data from GPU -> CPU
-    if (cudaMemcpy(output.data.data(),
-                   tensor->mutable_data(),
-                   tensor->valid_size() * sizeof(float),
-                   cudaMemcpyDeviceToHost) != 0) {
-      LOG(ERROR) << "copy data from GPU to CPU error";
-      return false;
-    }
-  }
-  return true;
-}
-
-anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-    &PaddleInferenceAnakinPredictor::get_executer() {
-  return executor_;
-}
-
-// the cloned new Predictor of anakin share the same net weights from original
-// Predictor
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
-  VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
-  // construct executer from other graph
-  auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
-  if (!anakin_predictor_p) {
-    LOG(ERROR) << "fail to call Init";
-    return nullptr;
-  }
-  anakin_predictor_p->get_executer().init(graph_);
-
-  return std::move(cls);
-}
-
-// A factory to help create difference predictor.
-template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
-    const AnakinConfig &config) {
-  VLOG(3) << "Anakin Predictor create.";
-  std::unique_ptr<PaddlePredictor> x(
-      new PaddleInferenceAnakinPredictor(config));
-  return x;
-};
-
-}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
deleted file mode 100644
index b100630dbe412ca811f1a8f2b8191356f5ebec2f..0000000000000000000000000000000000000000
--- a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
-#include "paddle/contrib/inference/paddle_inference_api.h"
-
-namespace paddle {
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-void Main(bool use_gpu) {
-  //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor =
-      CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
-
-  for (int batch_id = 0; batch_id < 3; batch_id++) {
-    //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
-
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
-
-    // For simplicity, we set all the slots with the same data.
-    std::vector<PaddleTensor> slots(4, tensor);
-
-    //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
-
-    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
-    }
-  }
-}
-
-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
-
-}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
new file mode 100644
index 0000000000000000000000000000000000000000..dd172ff9c97814c089ddb2e5bf729880cf0c9cdb
--- /dev/null
+++ b/paddle/fluid/API.spec
@@ -0,0 +1,425 @@
+paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.rollback ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
+paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
+paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
+paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
+paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
+paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
+paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10))
+paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
+paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
+paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
+paddle.fluid.InferenceTranspiler.__init__ 
+paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.DistributeTranspilerConfig.__init__ 
+paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
+paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
+paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
+paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
+paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
+paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
+paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
+paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
+paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0))
+paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
+paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
+paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None))
+paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
+paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
+paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid'))
+paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
+paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
+paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, True))
+paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
+paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False))
+paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
+paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
+paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
+paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
+paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
+paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
+paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
+paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
+paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
+paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
+paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
+paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
+paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
+paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
+paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
+paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
+paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
+paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
+paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
+paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
+paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
+paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
+paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
+paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
+paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
+paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0))
+paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
+paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
+paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
+paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
+paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
+paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
+paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
+paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
+paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
+paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.ParallelDo.read_input ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.ParallelDo.write_output ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both'))
+paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.scale ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_add ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_div ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_sub ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_max ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_min ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elementwise_pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.exp ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.tanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.tanh_shrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.sqrt ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.abs ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.ceil ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.floor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.cos ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.sin ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.round ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.reciprocal ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.square ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.softplus ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.softsign ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.brelu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.leaky_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.soft_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
+paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
+paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
+paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
+paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
+paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
+paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
+paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
+paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False))
+paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
+paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
+paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
+paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
+paddle.fluid.transpiler.InferenceTranspiler.__init__ 
+paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ 
+paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False))
+paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
+paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
+paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
+paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
+paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
+paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
+paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
+paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
+paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
+paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
+paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
+paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None  2. __init__(self: paddle.fluid.core.LoDTensor) -> None
+paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
+paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
+paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
+paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None  2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None  3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None  4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None  5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None  6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None  7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None  8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None  9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None  10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None  11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None  12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None  13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None  14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None  15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None  16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None  17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None  18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None  19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None  20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None  21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None
+paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
+paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
+paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
+paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
+paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
+paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
+paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
+paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False))
+paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,))
+paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True))
+paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
+paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
+paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
+paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
+paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
+paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
+paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
+paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
+paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d274d96c29bdbf5973d568d783369c3975bdc436..2577e59d9cf24c26b7c04aa00cdde6cde17f7206 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -5,5 +5,7 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
-# NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
+if(WITH_INFERENCE)
+  # NOTE: please add subdirectory inference at last.
+  add_subdirectory(inference)
+endif()
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 397c9f739452e5130dad28a763b92cf76720ec61..6440607dbe4666ff3ff91dc526465706b3b9c1f0 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(details)
+add_subdirectory(ir)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -6,10 +7,11 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
+cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -21,12 +23,18 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
+if(WITH_GPU)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+else()
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+endif()
+
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
+cc_test(reader_test SRCS reader_test.cc DEPS reader)
 
 cc_test(variable_test SRCS variable_test.cc)
 
@@ -92,7 +100,7 @@ else()
 endif()
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -103,7 +111,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-      
+
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index ce48548418478cc5c9f9ca1244df9e66dca884e6..960ca39e1eadd3c064beb0e2c1342a406c4f0b6a 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -88,9 +88,8 @@ class BlockDesc {
   OpDesc *InsertOp(size_t index);
 
   /*
-   * Remove Op and its input/output variables.
-   * Note that for either input or output variable, if it is also an input or
-   * output variable of other ops, we should remain it.
+   * Only remove op itself,
+   * do nothing to its input and output variables
    */
   void RemoveOp(size_t s, size_t e);
 
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 60382faffb8e53870658b2d1ff83abc4008cb4cf..1a9ce746ea840bc088d222cc4e9bc05159d64734 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <unordered_map>
 
+using float16 = paddle::platform::float16;
+
 namespace paddle {
 namespace framework {
 
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
   RegisterType<cc_type>(retv, proto_type, #cc_type)
 
   // NOTE: Add your customize type here.
-  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float16, proto::VarType::FP16);
   RegType(float, proto::VarType::FP32);
   RegType(double, proto::VarType::FP64);
   RegType(int, proto::VarType::INT32);
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437
--- /dev/null
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/data_type.h"
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(DataType, float16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::float16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
+
+  // test fp16 tensor
+  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
+
+  // test fp16 size
+  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
+
+  // test debug info
+  std::string type = "float16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 4fb4ec38ee965a2790d11378a1ce6befa0ef5a00..5d652d37307d0a55ffee14930ae180dcd3e27841 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,12 +1,11 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
-cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph graph_helper)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
 
@@ -32,10 +31,7 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
-
-cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
-
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
 
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index b335d3a0d364c916e19574de8d3ed89aaec7de41..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,16 +17,21 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 #ifdef PADDLE_WITH_CUDA
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
                                      const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      nccl_ctxs_(ctxs) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
       this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
@@ -34,12 +39,14 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
   }
 }
 #else
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index fdd250b0d3eb166249271a95f7592b9fadee5265..f6ef3a1367b91b6abf8ce74a91f73056efd0f84e 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -30,11 +30,11 @@ namespace details {
 
 struct AllReduceOpHandle : public OpHandleBase {
 #ifdef PADDLE_WITH_CUDA
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *ctxs);
 #else
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
 #endif
   std::string Name() const override;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 8036f756b6d6506684c109ab881d546f38176a10..fe4e733e43417977df324fde808f52b228a27d19 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,10 +35,13 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  public:
 #ifdef PADDLE_WITH_CUDA
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
         dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase {
     }
   }
 #else
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 
   std::string Name() const override;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index c6e923ef77ff03413eefe4f26457a5322747618e..1413f7bd9ac515ae7dceee62de8f3bc74e3a2efc 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -96,48 +96,61 @@ struct TestBroadcastOpHandle {
     }
     param_scopes_[input_scope_idx]->Var("input");
 
+    std::unique_ptr<ir::Node> n(
+        new ir::Node("node0", ir::Node::Type::kOperation));
     if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
 #endif
     }
 
-    auto* in_var_handle =
-        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    std::unique_ptr<ir::Node> v(
+        new ir::Node("node1", ir::Node::Type::kVariable));
+    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
+                                        gpu_list_[input_scope_idx]);
     vars_.emplace_back(in_var_handle);
     op_handle_->AddInput(in_var_handle);
 
     // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+
+    std::unique_ptr<ir::Node> v2(
+        new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v2.get()));
     DummyVarHandle* dummy_var_handle =
         static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->generated_op_ = nullptr;
+    dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddInput(dummy_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       if (!use_gpu_) {
         op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
       }
-      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      std::unique_ptr<ir::Node> v3(
+          new ir::Node("node3", ir::Node::Type::kVariable));
+      VarHandle* out_var_handle =
+          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
       vars_.emplace_back(out_var_handle);
       op_handle_->AddOutput(out_var_handle);
     }
 
     // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    std::unique_ptr<ir::Node> v4(
+        new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v4.get()));
     DummyVarHandle* out_dummy_var_handle =
         static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->generated_op_ = nullptr;
+    out_dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddOutput(out_dummy_var_handle);
   }
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index b2e5399e2376a86c1cd310b29c768832665af87f..8714a42162bda3d5ad12e7925fe8cc4e693f51b1 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -21,6 +21,26 @@ namespace framework {
 namespace details {
 
 struct BuildStrategy {
+  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
+  // kReduce, for CPU and GPU. If you use kAllReduce, different threads
+  // optimize their parameters separately. If you use kReduce, the optimizations
+  // of parameters are distributed to different threads.
+  // For example, a model has 100 parameters and is running with four threads,
+  // if you choose kAllReduce, every thread is to optimize 100 parameters
+  // separately, if you choose kReduce, every thread is to optimize 25
+  // parameters.
+  // Of particular note is, if you use kReduce when using CPU training,
+  // all the parameters are shared between different threads. This feature will
+  // save memory.
+  // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not
+  // equal for GPU. Because, the result of the different order of summing maybe
+  // different, for example, the result of `a+b+c+d` may be different with the
+  // result of `c+a+b+d`.
+  // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL,
+  // so the result of kAllReduce and kReduce maybe not equal.
+  // For CPU, if you want to fix the order of summing to make the result
+  // of kAllReduce and kReduce no diff, you can add
+  // `FLAGS_cpu_deterministic=true` to env.
   enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
 
   enum class GradientScaleStrategy {
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index df05bb06333d6b964f2f5434c3d43214e5d2cb7a..b6282debdb4eb6b1f29c39e54ac4f3e2296838da 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -19,9 +19,10 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                          platform::Place place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(*node->Op())),
       scope_(scope),
       place_(place) {}
 
@@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() {
 
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
   bool need_wait =
-      in_var && in_var->generated_op_ &&
-      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var && in_var->GeneratedOp() &&
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
   return need_wait;
 }
 
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index f048f973fdeb6cf7d1485cda8cea7d530d9ba465..d9fcd92427ef38b131b4ce782c0ada37765682db 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -28,8 +28,7 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
  public:
-  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
-                      platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
 
   std::string Name() const override;
 
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 68896c8ac1bae7d4bfcfa79cc8ec5c26bf2d93ee..525d24322442ef4dd6e8c24212af61c908959b87 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -22,10 +22,10 @@ namespace details {
 
 #ifdef PADDLE_WITH_CUDA
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places) {
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
   if (ctxs) {
     for (auto &p : places_) {
       this->dev_ctxes_[p] = ctxs->DevCtx(p);
@@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle(
 }
 #else
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 
 std::string DataBalanceOpHandle::Name() const { return "data balance"; }
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
index 76a407e3610e8bb48facf1f814779f4c23f92d98..0462fb6ec713eb977f420a9cb485c0273e782496 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -30,11 +30,11 @@ namespace details {
 struct DataBalanceOpHandle : public OpHandleBase {
  public:
 #ifdef PADDLE_WITH_CUDA
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                       const std::vector<platform::Place> &places,
                       const platform::NCCLContextMap *ctxs);
 #else
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                       const std::vector<platform::Place> &places);
 #endif
 
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ExceptionHolder {
+ public:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
+  bool ExceptionCatched() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return exception_.get() != nullptr;
+  }
+
+  void Throw() {
+    std::lock_guard<std::mutex> lock(mu_);
+    switch (type_) {
+      case kNone:
+        break;
+      case kEnforceNotMet: {
+        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
+        throw e;
+        break;
+      }
+      case kEOF: {
+        auto e = *static_cast<platform::EOFException*>(exception_.get());
+        throw e;
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown exception.";
+    }
+    exception_.reset();
+    type_ = kNone;
+  }
+
+  void Clear() {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset();
+    type_ = kNone;
+  }
+
+ private:
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  ExceptionType type_{kNone};
+
+  std::unique_ptr<std::exception> exception_;
+  mutable std::mutex mu_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index d646c944601e81477787740189d7ac60ae97fa80..fe18b2060c5cd7e157374da53c5a985f70545ab7 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -21,13 +21,16 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                              std::vector<Scope *> *local_scopes)
-    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+    : OpHandleBase(node),
+      data_(data),
+      offset_(offset),
+      local_scopes_(local_scopes) {}
 
 FetchOpHandle::~FetchOpHandle() {
   for (auto *input_var : inputs_) {
-    input_var->pending_ops_.erase(this);
+    input_var->RemoveOutput(this, this->Node());
   }
 }
 
@@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() {
 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
   auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
   for (auto *input : inputs_) {
-    if (input->generated_op_) {
-      input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
+    if (input->GeneratedOp()) {
+      input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx);
     }
   }
 }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index e09bdd1d3338bb175c1ddae35b53f98197b68e9a..6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -28,7 +28,7 @@ namespace details {
 
 struct FetchOpHandle : public OpHandleBase {
  public:
-  FetchOpHandle(FeedFetchList *data, size_t offset,
+  FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                 std::vector<Scope *> *local_scopes);
 
   ~FetchOpHandle();
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
index 140fb5bb49a33146de974b6d79559b4cf15bdd7b..3f360c510a4fdc0caaeb15d862b217ef41b8ea6e 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -30,10 +30,12 @@ namespace details {
 
 struct FuseVarsOpHandle : public OpHandleBase {
  public:
-  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
+                   const platform::Place &place,
                    const std::unordered_map<std::string, int64_t> &inputs_numel,
                    const std::type_index &var_type)
-      : local_scope_(local_scope),
+      : OpHandleBase(node),
+        local_scope_(local_scope),
         place_(place),
         inputs_numel_(inputs_numel),
         type_(var_type) {
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 2be02304566cf5dbe348fa01fc4171990eafd158..9aae19fc73de4387186da47c55710c94d53f1b88 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -20,9 +20,10 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+GatherOpHandle::GatherOpHandle(ir::Node *node,
+                               const std::vector<Scope *> &local_scopes,
                                const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 
 void GatherOpHandle::RunImpl() {
   if (places_.size() == 1) return;
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index d11ef8556aa8840949ca8dc7aa176413f70b9f22..d9afbc6547e18e8886c414ff150e332cfaf9b0c3 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -30,7 +30,7 @@ namespace details {
 
 struct GatherOpHandle : public OpHandleBase {
  public:
-  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+  GatherOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places);
 
   std::string Name() const override;
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 3cce2cc1640b3866130126424ff8fef18b8befc6..c9b94d1e1039df6ff27f9ffe225b2a50c35a5c50 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -70,6 +70,7 @@ struct TestGatherOpHandle {
   }
 
   void InitGatherOp(size_t input_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
       Scope& local_scope = local_scopes_.back()->NewScope();
@@ -81,30 +82,37 @@ struct TestGatherOpHandle {
     }
     param_scopes_[input_scope_idx]->Var("out");
 
-    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation));
+    op_handle_.reset(
+        new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
     // add input
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable));
+      auto* in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
     }
 
     // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
     DummyVarHandle* in_dummy_var_handle =
         static_cast<DummyVarHandle*>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddInput(in_dummy_var_handle);
 
     // add output
-    auto* out_var_handle =
-        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable));
+    auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
+                                         "out", gpu_list_[input_scope_idx]);
     vars_.emplace_back(out_var_handle);
     op_handle_->AddOutput(out_var_handle);
 
     // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
     DummyVarHandle* dummy_var_handle =
         static_cast<DummyVarHandle*>(vars_.back().get());
     op_handle_->AddOutput(dummy_var_handle);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index b82c2ef4082110f1621eb38d50361396511a4825..a4fdbcb26d1d0cfb05edebff5419d9559c336b3a 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -25,6 +25,8 @@
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
 
@@ -32,30 +34,22 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+static const char kLossVarName[] = "loss_var_name";
+static const char kPlaces[] = "places";
+static const char kParams[] = "params";
+static const char kLocalScopes[] = "local_scopes";
+static const char kStrategy[] = "strategy";
+
+void MultiDevSSAGraphBuilder::Init() const {
+  loss_var_name_ = Get<const std::string>(kLossVarName);
+  places_ = Get<const std::vector<platform::Place>>(kPlaces);
+  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
+  strategy_ = Get<const BuildStrategy>(kStrategy);
 #ifdef PADDLE_WITH_CUDA
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs),
-      strategy_(strategy) {
-#else
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      strategy_(strategy) {
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
-  for (auto &p : params) {
+
+  for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
@@ -66,31 +60,38 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
-                                                const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
+                                                ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
 
-  for (auto &each_var_name : op.InputArgumentNames()) {
-    VarHandle *var =
-        CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
+  for (ir::Node *input : node->inputs) {
+    VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id);
     op_handle->AddInput(var);
   }
 
-  for (auto &each_var_name : op.OutputArgumentNames()) {
-    CreateOpOutput(result, op_handle, each_var_name, p, place_id);
+  for (ir::Node *output : node->outputs) {
+    ir::Node *new_node = nullptr;
+    if (output->Var()) {
+      new_node = result->CreateVarNode(output->Var());
+    } else {
+      new_node =
+          result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
+    }
+    CreateOpOutput(result, op_handle, new_node, p, place_id);
   }
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
-    const ProgramDesc &program) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> send_vars;
   // since parameters are all in block 0,
   // it's enough to only scan send ops in block 0
-  for (auto *op : program.Block(0).AllOps()) {
+  for (auto &node : nodes) {
+    OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find send op,
     // instead of the the hard code string
     if (op->Type() == "send") {
@@ -104,9 +105,10 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
-    const ProgramDesc &program) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> recv_vars;
-  for (auto *op : program.Block(0).AllOps()) {
+  for (auto &node : nodes) {
+    OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find recv op,
     // instead of the hard code string
     if (op->Type() == "recv") {
@@ -120,7 +122,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
 }
 
 bool MultiDevSSAGraphBuilder::IsDistTrainOp(
-    const OpDesc &op, const std::vector<std::string> &send_vars,
+    ir::Node *node, const std::vector<std::string> &send_vars,
     const std::vector<std::string> &recv_vars) const {
   if (send_vars.size() == 0 || recv_vars.size() == 0) {
     return false;
@@ -143,14 +145,24 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
     return false;
   };
 
-  return checker(op.OutputArgumentNames(), send_vars) ||
-         checker(op.InputArgumentNames(), recv_vars);
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
 }
 
 size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
     const std::vector<std::string> &var_names) const {
   int64_t numel_sum = 0;
   for (auto var_name : var_names) {
+    if (all_vars_.find(var_name) == all_vars_.end()) continue;
     auto var_desc = all_vars_.at(var_name);
     PADDLE_ENFORCE_NOT_NULL(var_desc);
     auto dim = framework::make_ddim(var_desc->GetShape());
@@ -167,25 +179,83 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
   return dev_id;
 }
 
-std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
-    const ProgramDesc &program) const {
-  for (auto *var : program.Block(0).AllVars()) {
-    all_vars_.emplace(var->Name(), var);
+// Topology sort the graph nodes from inputs to outputs.
+// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
+// to parameter/gradients before optimizer ops, topo sort is insufficient. (
+// some optimizer ops might not depend on any nodes), we manually move all
+// optimizer nodes after last backward nodes.
+// However, the assumption by SSAGraphBuilder should be relaxed in the future.
+std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
+  std::vector<ir::Node *> ret = ir::TopologySortOperations(graph);
+  size_t last_backward = 0;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (boost::get<int>(
+            ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = i;
+    }
+  }
+
+  std::vector<ir::Node *> optimize_ops;
+  std::vector<ir::Node *> sorted_ret;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (i < last_backward) {
+      if (boost::get<int>(ret[i]->Op()->GetAttr(
+              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+          static_cast<int>(OpRole::kOptimize)) {
+        optimize_ops.push_back(ret[i]);
+      } else {
+        sorted_ret.push_back(ret[i]);
+      }
+    } else if (i == last_backward) {
+      sorted_ret.push_back(ret[i]);
+      // Verify that no operations before optimize ops depends on optimize ops.
+      std::unordered_set<ir::Node *> optimize_set(optimize_ops.begin(),
+                                                  optimize_ops.end());
+      for (ir::Node *n : sorted_ret) {
+        for (ir::Node *in : n->inputs) {
+          for (ir::Node *pre_n : in->inputs) {
+            PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(),
+                           "optimize operations cannot be depended by forward "
+                           "or backward node %s -> %s",
+                           pre_n->Name(), n->Name());
+          }
+        }
+      }
+      sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(),
+                        optimize_ops.end());
+    } else {
+      sorted_ret.push_back(ret[i]);
+    }
   }
+  return sorted_ret;
+}
 
-  auto graph = new SSAGraph();
-  SSAGraph &result = *graph;
+std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  Init();
+  // Give the topology sort order and rebuild the graph structure.
+  std::vector<ir::Node *> sorted_ops = SortOpsAndDelayOptimizeOp(*graph);
+  auto nodes = graph->ReleaseNodes();
+  ir::Graph &result = *graph;
+
+  for (auto &node : nodes) {
+    if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
+      all_vars_.emplace(node->Name(), node->Var());
+    }
+  }
   std::unordered_set<std::string> og_has_been_broadcast;
 
   // We cannot invoke resize. It is a bug of GCC 4.8
-  result.vars_ = std::vector<
-      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
-      places_.size());
+  result.Set(kGraphVars, new GraphVars(places_.size()));
+  result.Set(kGraphDepVars, new GraphDepVars);
+  result.Set(kGraphOps, new GraphOps);
+  result.Set(kShardedVarDevice, new ShardedVarDevice);
 
   // find send/recv vars so that we can place the distributed training
-  // realted op in the place 0
-  auto send_vars = FindDistTrainSendVars(program);
-  auto recv_vars = FindDistTrainRecvVars(program);
+  // related op in the place 0
+  auto send_vars = FindDistTrainSendVars(sorted_ops);
+  auto recv_vars = FindDistTrainRecvVars(sorted_ops);
 
   std::vector<std::unordered_set<std::string>> bcast_var_name_set;
   bcast_var_name_set.resize(places_.size());
@@ -193,18 +263,20 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   size_t cur_device_id = 0;
   bool is_forwarding = true;
 
-  for (auto *op : program.Block(0).AllOps()) {
+  for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
-            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+            node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, *op);
-    } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, *op);
-    } else if (IsScaleLossOp(*op)) {
+      CreateRPCOp(&result, node);
+    } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, node);
+    } else if (IsScaleLossOp(node)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
           BuildStrategy::GradientScaleStrategy::kCustomized) {
-        CreateScaleLossGradOp(&result);
+        // TODO(paddle-dev): Why is there no input for this op_handle?
+        auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
+        CreateScaleLossGradOp(&result, loss_grad_name);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -212,33 +284,35 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       // the block.
       is_forwarding = false;
     } else {
-      int op_dev_id = GetOpDeviceID(*op);
+      int op_dev_id = GetOpDeviceID(result, node);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
-        CreateComputationalOp(&result, *op, op_dev_id);
-        for (auto &var_name : op->OutputArgumentNames()) {
-          var_name_on_devices_.emplace(var_name, op_dev_id);
+        CreateComputationalOp(&result, node, op_dev_id);
+        for (ir::Node *n : node->outputs) {
+          graph->Get<ShardedVarDevice>(kShardedVarDevice)
+              .emplace(n->Name(), op_dev_id);
         }
       } else {
         // This op runs on all devices, and its output may have parameter's
         // gradients.
-        if (op->Type() == "read" && strategy_.enable_data_balance_) {
-          op->SetAttr("throw_eof_exp", false);
-          CreateComputationalOps(&result, *op, places_.size());
-          const auto &data_var_names = op->Output("Out");
+        // TODO(paddle-dev): Why is so special about "read" op?
+        if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
+          node->Op()->SetAttr("throw_eof_exp", false);
+          CreateComputationalOps(&result, node, places_.size());
+          const auto &data_var_names = node->Op()->Output("Out");
           InsertDataBalanceOp(&result, data_var_names);
         } else {
-          CreateComputationalOps(&result, *op, places_.size());
+          CreateComputationalOps(&result, node, places_.size());
         }
 
         if (!is_forwarding && places_.size() > 1) {
           // Currently, we assume that once gradient is generated, it can be
           // broadcast, and each gradient is only broadcast once.
-          if (static_cast<bool>(boost::get<int>(op->GetAttr(
+          if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
                                     OpProtoAndCheckerMaker::OpRoleAttrName())) &
                                 static_cast<int>(OpRole::kBackward))) {
             try {
-              auto backward_vars =
-                  boost::get<std::vector<std::string>>(op->GetNullableAttr(
+              auto backward_vars = boost::get<std::vector<std::string>>(
+                  node->Op()->GetNullableAttr(
                       OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
               PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
@@ -252,7 +326,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
                   case BuildStrategy::ReduceStrategy::kReduce:
                     cur_device_id = GetAppropriateDeviceID({g_name});
                     CreateReduceOp(&result, g_name, cur_device_id);
-                    var_name_on_devices_.emplace(g_name, cur_device_id);
+                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
+                        .emplace(g_name, cur_device_id);
                     bcast_var_name_set[cur_device_id].emplace(p_name);
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
@@ -276,25 +351,33 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     }
   }
 
-  // Insert BCast Ops
-  for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
-    auto &to_bcast_set = bcast_var_name_set[dev_id];
-    for (auto &bcast_name : to_bcast_set) {
-      CreateBroadcastOp(&result, bcast_name, dev_id);
+  bool use_gpu = false;
+#ifdef PADDLE_WITH_CUDA
+  use_gpu = nccl_ctxs_ != nullptr;
+#endif
+
+  if (use_gpu ||
+      strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+    // Insert BCast Ops
+    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+      auto &to_bcast_set = bcast_var_name_set[dev_id];
+      for (auto &bcast_name : to_bcast_set) {
+        CreateBroadcastOp(&result, bcast_name, dev_id);
+      }
     }
   }
   /*
-    Dependency graph has been constructed. However, there are still data
-    hazards need to be handled.
-   */
+  Dependency graph has been constructed. However, there are still data
+  hazards need to be handled.
+ */
   PolishGraphToSupportDataHazards(&result);
 
   /*
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-
-  return std::unique_ptr<SSAGraph>(graph);
+  PADDLE_ENFORCE(!ir::HasCircle(result));
+  return graph;
 }
 
 bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
@@ -318,78 +401,96 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
 #endif
 }
 
-void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
+void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
                                                 const std::string &p_name,
                                                 size_t src_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
+  auto *op_handle = new BroadcastOpHandle(
+      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_);
 #else
-  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
+  auto *op_handle = new BroadcastOpHandle(
+      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_);
 #endif
+  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
-  result->ops_.emplace_back(op_handle);
-  auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
+  auto *in =
+      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back().get();
   op_handle->AddInput(in);
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_.at(i).at(p_name);
-    auto *out_var = new VarHandle(vars.size(), i, p_name, p);
+    auto &vars = result->Get<GraphVars>(kGraphVars).at(i).at(p_name);
+    auto *out_var = new VarHandle(
+        result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
+        i, p_name, p);
     vars.emplace_back(out_var);
     op_handle->AddOutput(out_var);
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
-                                                    const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
+                                                    ir::Node *node,
                                                     int dev_id) const {
-  result->ops_.emplace_back(
-      new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]));
-  CreateOpHandleIOs(result, op, dev_id);
+  result->Get<GraphOps>(kGraphOps).emplace_back(
+      new ComputationOpHandle(result->CreateOpNode(node->Op()),
+                              local_scopes_[dev_id], places_[dev_id]));
+  CreateOpHandleIOs(result, node, dev_id);
 }
 
-void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
                                                 const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
 
-    auto var = new VarHandle(vars.size(), i, og, p);
+    auto var =
+        new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
+                      vars.size(), i, og, p);
     vars.emplace_back(var);
     op_handle->AddOutput(var);
   }
 }
 
 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
-    SSAGraph *result, const std::vector<std::string> &datas) const {
+    ir::Graph *result, const std::vector<std::string> &datas) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
+      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
+      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
     for (const std::string &d_name : datas) {
-      auto &vars = result->vars_[i][d_name];
+      auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
       PADDLE_ENFORCE(!vars.empty());
       op_handle->AddInput(vars.back().get());
-      auto var = new VarHandle(vars.size(), i, d_name, p);
+      auto var = new VarHandle(
+          result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
+          vars.size(), i, d_name, p);
       vars.emplace_back(var);
       op_handle->AddOutput(var);
     }
@@ -408,26 +509,35 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
-int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
+                                           ir::Node *node) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
-
-  for (auto &varname : op.InputArgumentNames()) {
-    int dev_id = GetVarDeviceID(varname);
-    if (dev_id != -1) {
-      return dev_id;
-    }
+  int op_role = boost::get<int>(
+      node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+  if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
+    return -1;
   }
-  return -1;
+  auto param_grad = boost::get<std::vector<std::string>>(
+      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
+  int dev_id = GetVarDeviceID(graph, param_grad[1]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
+                    node->Op()->Type(), param_grad[0], param_grad[1]);
+  return dev_id;
 }
 
-int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
-  auto got = var_name_on_devices_.find(varname);
-  return got == var_name_on_devices_.end() ? -1 : got->second;
+int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
+                                            const std::string &varname) const {
+  auto &sharded_var_device = graph.Get<ShardedVarDevice>(kShardedVarDevice);
+  auto got = sharded_var_device.find(varname);
+  return got == sharded_var_device.end() ? -1 : got->second;
 }
 
-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
+    ir::Graph *result, const std::string &loss_grad_name) const {
   for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@@ -438,11 +548,11 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
     auto *communication_dev_ctx =
         platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
 #endif
-
-    auto *op_handle =
-        new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
-                                  places_[i], communication_dev_ctx);
-    result->ops_.emplace_back(op_handle);
+    auto *op_handle = new ScaleLossGradOpHandle(
+        result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
+        local_scopes_.size(), local_scopes_[i], places_[i],
+        communication_dev_ctx);
+    result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
     // FIXME: Currently ScaleLossGradOp only use device_count as scale
     // factor. So it does not depend on any other operators.
@@ -450,43 +560,51 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);
 
-    CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
-                   i);
+    CreateOpOutput(
+        result, op_handle,
+        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
+        places_[i], i);
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
-                                                     const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
+                                                     ir::Node *node,
                                                      size_t num_places) const {
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
-    CreateOpHandleIOs(result, op, scope_idx);
+    result->Get<GraphOps>(kGraphOps).emplace_back(
+        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    CreateOpHandleIOs(result, node, scope_idx);
   }
 }
 
-VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
+VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
                                                    const std::string &og,
                                                    int dst_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
+      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
+      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
   }
-  auto &vars = result->vars_[dst_dev_id][og];
-  auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
+  auto &vars = result->Get<GraphVars>(kGraphVars)[dst_dev_id][og];
+  auto var =
+      new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
+                    vars.size(), dst_dev_id, og, places_[dst_dev_id]);
   vars.emplace_back(var);
   op_handle->AddOutput(var);
   return var;
@@ -494,36 +612,50 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
 
 // Find the first occurence of `prev_op_name` and make current `op` depend
 // on it.
-void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
+void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
                                         const std::string &prev_op_name) const {
-  for (auto &prev_op : result->ops_) {
+  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
     if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle();
+      auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
       prev_op->AddOutput(dep_var);
-      result->dep_vars_.emplace(dep_var);
+      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
       op->AddInput(dep_var);
     }
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
-                                                const OpDesc &op) const {
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
+                                                ir::Node *node) const {
   int op_dev_id = -1;
-  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  if (node->Op()->Type() == "split_byref" ||
+      node->Op()->Type() == "split_selected_rows") {
+    // TODO(paddle-dev): getting the first var is not safe.
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
-      for (auto &varname : op.InputArgumentNames()) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+      op_dev_id = GetAppropriateDeviceID(input_var_names);
+      for (auto &varname : input_var_names) {
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
+            .emplace(varname, op_dev_id);
       }
     }
-    for (auto &varname : op.OutputArgumentNames()) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+    for (auto &varname : output_var_names) {
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
-  } else if (op.Type() == "concat") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
-    for (auto &varname : op.OutputArgumentNames()) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+  } else if (node->Op()->Type() == "concat") {
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
+    for (auto &varname : output_var_names) {
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
   } else {
     PADDLE_ENFORCE(
@@ -532,35 +664,50 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
   }
 
   PADDLE_ENFORCE(op_dev_id != -1,
-                 "can not find right place for distributed op: %s", op.Type());
+                 "can not find right place for distributed op: %s",
+                 node->Op()->Type());
 
-  CreateComputationalOp(result, op, op_dev_id);
-  if (op.Type() == "concat") {
-    ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
+  CreateComputationalOp(result, node, op_dev_id);
+  if (node->Op()->Type() == "concat") {
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
+              "fetch_barrier");
   }
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
-                                          const OpDesc &op) const {
+void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+                                          ir::Node *node) const {
   int op_dev_id = -1;
-  if (op.Type() == "send") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+  if (node->Op()->Type() == "send") {
+    // TODO(paddle-dev): getting the first var is not safe.
+    op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name());
+    PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
+                   "This hack no longer holds, please fix.");
     // the variable name which contains .block means it was splited by
     // split_byref op
     // so that we can balance the variable blocks to all the pserver
     // instances.
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
-        op.InputArgumentNames()[0].find(".block") == std::string::npos) {
-      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
-      for (auto &varname : op.InputArgumentNames()) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+        node->inputs[0]->Name().find(".block") == std::string::npos) {
+      std::vector<std::string> input_var_names;
+      for (ir::Node *n : node->inputs) {
+        input_var_names.push_back(n->Name());
+      }
+      op_dev_id = GetAppropriateDeviceID(input_var_names);
+      for (auto &varname : input_var_names) {
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
+            .emplace(varname, op_dev_id);
       }
     }
-  } else if (op.Type() == "recv") {
-    op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
-    for (auto &varname : op.OutputArgumentNames()) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+  } else if (node->Op()->Type() == "recv") {
+    std::vector<std::string> output_var_names;
+    for (ir::Node *n : node->outputs) {
+      output_var_names.push_back(n->Name());
+    }
+    op_dev_id = GetAppropriateDeviceID(output_var_names);
+    for (auto &varname : output_var_names) {
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
   } else {
     // send_barrier and fetch_barrier op can be scheduled on device 0
@@ -568,18 +715,21 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
   }
 
   PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
-                 op.Type());
-
-  result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
-                                            op.Type(), places_[op_dev_id]));
-
-  if (op.Type() == "send_barrier") {
-    ConnectOp(result, result->ops_.back().get(), "send");
-  } else if (op.Type() == "recv") {
-    ConnectOp(result, result->ops_.back().get(), "send_barrier");
-  } else if (op.Type() == "fetch_barrier") {
-    ConnectOp(result, result->ops_.back().get(), "recv");
-  } else if (op.Type() == "send") {
+                 node->Op()->Type());
+
+  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
+      result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
+      node->Op()->Type(), places_[op_dev_id]));
+
+  // TODO(panyx0718): This might not be needed anymore.
+  if (node->Op()->Type() == "send_barrier") {
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
+  } else if (node->Op()->Type() == "recv") {
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
+              "send_barrier");
+  } else if (node->Op()->Type() == "fetch_barrier") {
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
+  } else if (node->Op()->Type() == "send") {
     // do nothing
   } else {
     PADDLE_THROW(
@@ -587,12 +737,12 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
         "send, send_barrier. recv, fetch_barrier]");
   }
 
-  CreateOpHandleIOs(result, op, op_dev_id);
+  CreateOpHandleIOs(result, node, op_dev_id);
 }
 
-bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
   return boost::get<int>(
-             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
              (static_cast<int>(OpRole::kBackward) |
               static_cast<int>(OpRole::kLoss)) &&
          !loss_var_name_.empty();  // If loss_var is empty. This is test mode
@@ -600,3 +750,11 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_pass,
+              paddle::framework::details::MultiDevSSAGraphBuilder)
+    .RequirePassAttr(paddle::framework::details::kLossVarName)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kParams)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes)
+    .RequirePassAttr(paddle::framework::details::kStrategy);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index a964e024885e56693224a6199e00ff30beaa1df4..f2cb6bb1c861e07f1034f1742ad4f3cfbb0d8837 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -19,6 +19,7 @@
 
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace platform {
@@ -30,81 +31,70 @@ class Scope;
 namespace details {
 
 class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
- public:
-#ifdef PADDLE_WITH_CUDA
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          platform::NCCLContextMap *nccl_ctxs,
-                          const BuildStrategy &strategy);
-#else
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          const BuildStrategy &strategy);
-#endif
-
-  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
-  int GetVarDeviceID(const std::string &varname) const override;
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
-  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
+  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
                          size_t device_id) const;
+  void Init() const;
 
  private:
-  std::string loss_var_name_;
-  const std::vector<platform::Place> &places_;
-  const std::vector<Scope *> &local_scopes_;
-  std::unordered_set<std::string> grad_names_;
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
 
 #ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nccl_ctxs_;
+  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
-  bool IsScaleLossOp(const OpDesc &op) const;
+  int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const;
 
-  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
-  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
+  bool IsScaleLossOp(ir::Node *node) const;
+
+  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
    */
-  bool IsDistTrainOp(const OpDesc &op,
-                     const std::vector<std::string> &send_vars,
+  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
                      const std::vector<std::string> &recv_vars) const;
 
   std::vector<std::string> FindDistTrainSendVars(
-      const ProgramDesc &program) const;
+      const std::vector<ir::Node *> &nodes) const;
 
   std::vector<std::string> FindDistTrainRecvVars(
-      const ProgramDesc &program) const;
+      const std::vector<ir::Node *> &nodes) const;
 
-  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+  void ConnectOp(ir::Graph *result, OpHandleBase *op,
                  const std::string &prev_op_name) const;
 
-  void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
+  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;
 
-  void CreateScaleLossGradOp(SSAGraph *result) const;
-  VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
+  void CreateScaleLossGradOp(ir::Graph *result,
+                             const std::string &loss_grad_name) const;
+
+  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
-  void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
+  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                              int dev_id) const;
 
   bool IsParameterGradientOnce(
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
-  int GetOpDeviceID(const OpDesc &op) const;
+  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
 
-  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
+  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
 
-  void InsertDataBalanceOp(SSAGraph *result,
+  void InsertDataBalanceOp(ir::Graph *result,
                            const std::vector<std::string> &datas) const;
 
-  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
+  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
   bool IsSparseGradient(const std::string &og) const;
@@ -113,9 +103,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::vector<std::string> &var_names) const;
 
  private:
-  BuildStrategy strategy_;
+  mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
-  mutable std::unordered_map<std::string, int> var_name_on_devices_;
   mutable std::vector<int64_t> balance_vars_;
 
   void SetCommunicationContext(OpHandleBase *op_handle,
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index d80bdcf15d798925c137460125964d3d7e65f67e..ee9f9184da65467b82794c99fe3e95b108373753 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 
 void OpHandleBase::AddInput(VarHandleBase *in) {
   this->inputs_.emplace_back(in);
-  in->pending_ops_.insert(this);
+  node_->inputs.push_back(in->Node());
+  in->AddOutput(this, this->Node());
 }
 
 void OpHandleBase::AddOutput(VarHandleBase *out) {
   outputs_.emplace_back(out);
-  out->generated_op_ = this;
+  node_->outputs.push_back(out->Node());
+  out->AddInput(this, this->Node());
 }
 
 void OpHandleBase::WaitInputVarGenerated() {
   for (auto in_var : inputs_) {
     if (NeedWait(in_var)) {
       for (auto &pair : dev_ctxes_) {
-        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
       }
     }
   }
@@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   for (auto *in : inputs_) {
     if (NeedWait(in)) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
     }
   }
 }
@@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const {
 }
 
 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
-  return in_var && in_var->generated_op_;
+  return in_var && in_var->GeneratedOp();
 }
 
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 6aec178831161f8ac1306fc3ed72e3267ca3c7e5..2d7f18942890245249dd0619a40bb43833c9a2ee 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -26,9 +27,11 @@ namespace details {
 
 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 
+// Wraps ir::Node and provide helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
  public:
-  OpHandleBase() {}
+  explicit OpHandleBase(ir::Node *node) : node_(node) {}
 
   virtual ~OpHandleBase();
 
@@ -82,6 +85,8 @@ class OpHandleBase {
 
   size_t NoDummyInputSize() const;
 
+  ir::Node *Node() { return node_; }
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
@@ -90,6 +95,7 @@ class OpHandleBase {
 
   virtual void RunImpl() = 0;
 
+  ir::Node *node_;
   std::vector<VarHandleBase *> inputs_;
   std::vector<VarHandleBase *> outputs_;
   std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d..e28264eb32756f77ef5baed3dff77ba9f0943160 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -35,14 +35,16 @@ struct ReduceLoDTensor {
     PADDLE_ENFORCE(!src_tensors_.empty());
     auto &t0 = *src_tensors_[0];
     PADDLE_ENFORCE_NE(t0.numel(), 0);
+
     dst_tensor_.Resize(t0.dims());
     T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    if (dst != t0.data<T>()) {
-      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-    }
 
-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+    for (size_t i = 0; i < src_tensors_.size(); ++i) {
       auto &t = *src_tensors_[i];
+      if (dst == t.data<T>()) {
+        continue;
+      }
+
       PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
       PADDLE_ENFORCE_EQ(t.type(), t0.type());
       std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346dad0615e2fd32b70c096880af0359e1a..6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,18 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(
+    cpu_deterministic, false,
+    "Whether to make the result of computation deterministic in CPU side.");
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
   } else {
     std::vector<const LoDTensor *> lod_tensors =
         GetInputValues<LoDTensor>(in_var_handles, var_scopes);
+
     if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
       this->RunAndRecordEvent([&] {
-        ReduceLoDTensor func(lod_tensors,
-                             out_var->GetMutable<framework::LoDTensor>());
-        VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+        // FIXME(zcd): The order of summing is important,
+        // especially when the type of data is float or double.
+        // For example, the result of `a+b+c+d` may be different
+        // with the result of `c+a+b+d`, so the summing order should be fixed.
+        if (!FLAGS_cpu_deterministic) {
+          ReduceLoDTensor func(lod_tensors,
+                               out_var->GetMutable<framework::LoDTensor>());
+          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+        } else {
+          // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
+          // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
+          auto &reduce_sum_trg = *this->local_scopes_[0]
+                                      ->FindVar(kLocalExecScopeName)
+                                      ->Get<Scope *>()
+                                      ->FindVar(out_var_handle->name_)
+                                      ->GetMutable<framework::LoDTensor>();
+          ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
+          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+
+          auto trg = out_var->GetMutable<framework::LoDTensor>();
+          if (reduce_sum_trg.data<void>() != trg->data<void>()) {
+            TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
+          }
+        }
       });
     } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 4d14334cdfe06e2e805c2577458d6689e6324cc7..a6289b055f97b7b0e57928358d84117b33cf2df8 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase {
 
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
                  const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
         dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase {
     }
   }
 #else
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif
 
   std::string Name() const override;
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index ffdd7c14eb5097cc8285da090e4a72e1e3f43d86..3a9a58412391b188c5e804b41fa47b3607a36bd1 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -84,6 +84,7 @@ struct TestReduceOpHandle {
   }
 
   void InitReduceOp(size_t out_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
     // init scope
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
       local_scopes_.push_back(&(g_scope_.NewScope()));
@@ -96,19 +97,21 @@ struct TestReduceOpHandle {
     }
     param_scopes_[out_scope_idx]->Var("out");
 
+    nodes.emplace_back(new ir::Node("node"));
     if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
     } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
 #endif
     }
 
@@ -118,8 +121,10 @@ struct TestReduceOpHandle {
       if (!use_gpu_) {
         op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
       }
-      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
-      in_var_handle->generated_op_ = nullptr;
+      nodes.emplace_back(new ir::Node("node1"));
+      auto *in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
+      in_var_handle->ClearGeneratedOp();
       vars_.emplace_back(in_var_handle);
       op_handle_->AddInput(in_var_handle);
     }
@@ -128,12 +133,13 @@ struct TestReduceOpHandle {
     vars_.emplace_back(new DummyVarHandle());
     DummyVarHandle *in_dummy_var_handle =
         static_cast<DummyVarHandle *>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
     op_handle_->AddInput(in_dummy_var_handle);
 
     // add output
-    auto *out_var_handle =
-        new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
+    nodes.emplace_back(new ir::Node("node2"));
+    auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx,
+                                         "out", gpu_list_[out_scope_idx]);
     vars_.emplace_back(out_var_handle);
     op_handle_->AddOutput(out_var_handle);
 
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index 586465f99fd94117c821be2952bffda385fbcf75..f44b374edb29228dff5a8bf003d945291f166d49 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -13,15 +13,17 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
                          const Scope *local_scope, const std::string &name,
                          const platform::Place &place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(op_desc)),
       local_scope_(local_scope),
       name_(name),
       place_(place) {}
@@ -32,11 +34,11 @@ void RPCOpHandle::RunImpl() {
   for (auto *in : inputs_) {
     auto &p = static_cast<VarHandle *>(in)->place_;
     // FIXME(Yancey1989): need a better solution instead of use DebugString()
-    if (in->DebugString() == "dummy") {  // HACK
+    if (ir::IsControlDepVar(*in->Node())) {  // HACK
       continue;
     }
-    if (in->generated_op_) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
+    if (in->GeneratedOp()) {
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
     }
   }
   auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
index ae38c7fe19e102a330455d89a1068414a7835fab..7f99cdeacf618a9496eaef98520685d6d1621ae1 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -28,8 +28,9 @@ namespace framework {
 namespace details {
 
 struct RPCOpHandle : public OpHandleBase {
-  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-              const std::string& name, const platform::Place& place);
+  RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc,
+              const Scope* local_scope, const std::string& name,
+              const platform::Place& place);
 
   std::string Name() const override;
 
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index d9c387e79dc71288e7330597fed57171d447f31b..609e18581957f62b040e04e937873b7a8fa5785a 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -19,10 +19,14 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
+                                             Scope *scope,
                                              platform::Place place,
                                              platform::DeviceContext *dev_ctx)
-    : coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
+    : OpHandleBase(node),
+      coeff_(static_cast<float>(1.0 / num_dev)),
+      scope_(scope),
+      place_(place) {
   dev_ctxes_[place_] = dev_ctx;
 }
 
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index d93d599d46f130cf98f39f15697ce994a31e20c3..523b55724c82d4e2bef0520c10e5708c952a3ecc 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -25,7 +25,8 @@ namespace framework {
 namespace details {
 
 struct ScaleLossGradOpHandle : public OpHandleBase {
-  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
+                        platform::Place place,
                         platform::DeviceContext *context);
 
   ~ScaleLossGradOpHandle() final;
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index eb4e7ec52f907f9403e21ec2734d61824f51a58b..5bd974d6b789a2f085c0a69de5e133187342f587 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <stdexcept>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -53,8 +55,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       }
     }
   }
+  std::vector<framework::LoDTensor> fetch_data;
+  std::exception_ptr eptr;
+  try {
+    fetch_data = underlying_executor_->Run(fetch_tensors);
+  } catch (...) {
+    eptr = std::current_exception();
+  }
 
-  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
@@ -69,7 +78,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       scope->DeleteScope(local_scope);
     }
   }
-  return fetch_data;
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  } else {
+    return fetch_data;
+  }
 }
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 20df7a4722d589ffd168f842e927cff8411096bb..5e87e0bf50b51d2b630aba06a5907dd721754d1f 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -17,6 +17,9 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -37,6 +40,11 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
       ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+
+  const ir::Graph& Graph() const override {
+    return underlying_executor_->Graph();
+  }
+
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
  private:
diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h
deleted file mode 100644
index e996a00c162186e47e77d007503ac67caa9f8024..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/ssa_graph.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// A SSA graph used by parallel executor.
-struct SSAGraph {
-  // all variable in each devices.
-  // The outside vector is the device vector. Each element of this vector is a
-  // map from variable name to variables. The variables, who have the same name,
-  // will have a different version. The offset in the
-  // `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
-  std::vector<
-      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
-      vars_;
-
-  // aux variables to represent dependency. Useful to resolve data hazard.
-  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
-
-  // all operators. NOTE that even we use a vector here, the operators is
-  // unordered.
-  std::vector<std::unique_ptr<OpHandleBase>> ops_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 88a21f48879a15450051ad94ed76e1c48bf23014..575532540a624afde5f6dab25b11e9eac93c6448 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,8 +17,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
-  for (auto &var_map : graph->vars_) {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       if (name_pair.second.size() <= 1) {
         continue;
@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
       auto it_old = name_pair.second.rbegin();
       ++it_old;
       for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = (*it_new)->generated_op_;
-        auto &read_ops = (*it_old)->pending_ops_;
+        OpHandleBase *write_op = (*it_new)->GeneratedOp();
+        const auto &read_ops = (*it_old)->PendingOps();
 
         for (auto *read_op : read_ops) {
           // Manually add a dependency var from read_op to write_op;
@@ -36,11 +36,21 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
             // Read Write is the same op.
             continue;
           }
+          bool has_dep = false;
+          for (auto *r_out : read_op->Outputs()) {
+            for (auto *w_in : write_op->Inputs()) {
+              if (r_out->Node() == w_in->Node()) {
+                has_dep = true;
+                break;
+              }
+            }
+          }
+          if (has_dep) continue;
 
-          auto *dep_var = new DummyVarHandle();
+          auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
           read_op->AddOutput(dep_var);
           write_op->AddInput(dep_var);
-          graph->dep_vars_.emplace(dep_var);
+          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
         }
       }
     }
@@ -48,13 +58,20 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
 }
 
 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    SSAGraph *graph, const std::string &each_var_name,
-    const platform::Place &place, size_t place_offset) {
-  auto &var_holders = graph->vars_[place_offset];
-  auto &var_holder = var_holders[each_var_name];
+    ir::Graph *graph, ir::Node *node, const platform::Place &place,
+    size_t place_offset) {
+  auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
+  auto &var_holder = var_holders[node->Name()];
   VarHandle *var = nullptr;
   if (var_holder.empty()) {
-    var = new VarHandle(0, place_offset, each_var_name, place);
+    if (node->Var()) {
+      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
+                          node->Name(), place);
+    } else {
+      var = new VarHandle(
+          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
+          place_offset, node->Name(), place);
+    }
     var_holder.emplace_back(var);
   } else {
     var = var_holder.rbegin()->get();
@@ -62,24 +79,26 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
   return var;
 }
 
-void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                                     const std::string &each_var_name,
+void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
+                                     ir::Node *new_node,
                                      const platform::Place &place,
                                      size_t place_offset) {
-  auto &vars = graph->vars_[place_offset][each_var_name];
+  auto &vars =
+      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
   size_t version = vars.size();
-  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  auto var =
+      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
   vars.emplace_back(var);
   op_handle->AddOutput(var);
 }
 
-void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
-  for (auto &op : graph->ops_) {
+void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (!op->Outputs().empty()) {
       continue;
     }
-    auto *dummy_leaf = new DummyVarHandle();
-    graph->dep_vars_.emplace(dummy_leaf);
+    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
     op->AddOutput(dummy_leaf);
   }
 }
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 18612c3c1b62cf4c2ebdc221c301c59ec81c2da7..53a4ad003d51a27a044d7a142434545eca0d5965 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -16,46 +16,68 @@
 
 #include <memory>
 #include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
 
-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"
 
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
 
-class SSAGraphBuilder {
+// all variable in each devices.
+// The outside vector is the device vector. Each element of this vector is a
+// map from variable name to variables. The variables, who have the same name,
+// will have a differsent version. The offset in the
+// `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
+typedef std::vector<
+    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+    GraphVars;
+const char kGraphVars[] = "vars";
+
+// aux variables to represent dependency. Useful to resolve data hazard.
+typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+const char kGraphDepVars[] = "dep_vars";
+
+// all operators. NOTE that even we use a vector here, the operators is
+// unordered.
+typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
+const char kGraphOps[] = "ops";
+
+typedef std::unordered_map<std::string, int> ShardedVarDevice;
+const char kShardedVarDevice[] = "sharded_var_device";
+
+class SSAGraphBuilder : public ir::Pass {
  public:
   SSAGraphBuilder() {}
   virtual ~SSAGraphBuilder() {}
-  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
-  virtual int GetVarDeviceID(const std::string &var_name) const = 0;
 
   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
 
  protected:
-  /**
-   * We only handle write after read(WAR), since it should not have a write
-   * after write in program. If there are write after write operators, we need
-   * prune them.
-   *
-   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
-   */
-  static void PolishGraphToSupportDataHazards(SSAGraph *graph);
-
-  static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
-                                               const std::string &each_var_name,
+  /*
+    Dependency graph has been constructed. However, there are still data
+    hazards need to be handled.
+  */
+  static void PolishGraphToSupportDataHazards(ir::Graph *graph);
+
+  static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                                const platform::Place &place,
                                                size_t place_offset);
 
   // Add an output variable (each_var_name, place, place_offset) to op_handle,
   // which belongs to graph
-  static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                             const std::string &each_var_name,
-                             const platform::Place &place, size_t place_offset);
+  static void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
+                             ir::Node *new_node, const platform::Place &place,
+                             size_t place_offset);
 
-  static void AddOutputToLeafOps(SSAGraph *graph);
+  static void AddOutputToLeafOps(ir::Graph *graph);
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
deleted file mode 100644
index b4b49d3de6da2e5fd7836668619e42d10bb6b35a..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
-#include <fstream>
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
-#include "paddle/fluid/framework/details/ssa_graph_checker.h"
-#include "paddle/fluid/framework/details/ssa_graph_printer.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
-  std::unique_ptr<SSAGraphBuilder> res(
-#ifdef PADDLE_WITH_CUDA
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, nccl_ctxs_, strategy_)
-#else
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, strategy_)
-#endif
-          );  // NOLINT
-
-  if (!strategy_.debug_graphviz_path_.empty()) {
-    std::unique_ptr<std::ostream> fout(
-        new std::ofstream(strategy_.debug_graphviz_path_));
-    PADDLE_ENFORCE(fout->good());
-    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
-        new GraphvizSSAGraphPrinter());
-    res.reset(new SSAGraghBuilderWithPrinter(
-        std::move(fout), std::move(graphviz_printer), std::move(res)));
-  }
-  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
-
-  return res;
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
deleted file mode 100644
index 91a119de83ed3d1573803e48faf86c874eed98d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
-#include "paddle/fluid/platform/place.h"
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-class Scope;
-namespace details {
-
-class SSAGraphBuilderFactory {
- public:
-  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
-                         const std::string& loss_var_name,
-                         const std::unordered_set<std::string>& param_names,
-                         const std::vector<Scope*>& local_scopes,
-                         const BuildStrategy& strategy)
-      : places_(places),
-        loss_var_name_(loss_var_name),
-        param_names_(param_names),
-        local_scopes_(local_scopes),
-        strategy_(strategy) {
-#ifdef PADDLE_WITH_CUDA
-    nccl_ctxs_ = nullptr;
-#endif
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
-    nccl_ctxs_ = nccl_ctxs;
-  }
-#endif
-
-  std::unique_ptr<SSAGraphBuilder> Create();
-
- private:
-  std::vector<platform::Place> places_;
-  std::string loss_var_name_;
-  std::unordered_set<std::string> param_names_;
-  std::vector<Scope*> local_scopes_;
-  BuildStrategy strategy_;
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap* nccl_ctxs_;
-#endif
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
index da5428946ee588e8eac1f78929dc0432df532975..b9e1cda1f24810009bc74a7abdf0156f723a1755 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/ssa_graph.h"
-#include <string>
 #include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   std::unordered_set<VarHandleBase *> ready_vars;
@@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
 
   auto insert_pending_var = [&](VarHandleBase *var) {
     pending_vars.insert(var);
-    if (var->generated_op_ == nullptr) {
+    if (var->GeneratedOp() == nullptr) {
       ready_vars.emplace(var);
     }
   };
 
-  for (auto &var_map : graph->vars_) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
     }
   }
 
-  for (auto &var : graph->dep_vars_) {
+  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
     insert_pending_var(var.get());
   }
 
-  for (auto &op : graph->ops_) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (op->Inputs().empty()) {
       ready_ops.insert(op.get());
     } else {
@@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
 
     for (auto ready_var : ready_vars) {
       pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
         auto &deps = --pending_ops[op];
         if (deps == 0) {
           ready_ops.insert(op);
@@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_check_pass,
+              paddle::framework::details::SSAGraghBuilderWithChecker)
+    .RequireGraphAttr(paddle::framework::details::kGraphVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphOps)
+    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 331aa9d2b5864c470dbd5e29ef6faccffdcf781c..0e861ecb236361992d9883e3bd0e679f7563b539 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -21,28 +21,16 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;
 
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
- public:
-  explicit SSAGraghBuilderWithChecker(
-      std::unique_ptr<SSAGraphBuilder>&& builder)
-      : builder_(std::move(builder)) {}
-
-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
     PADDLE_ENFORCE(IsValidGraph(graph.get()));
     return graph;
   }
 
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
-  }
-
-  bool IsValidGraph(const SSAGraph* graph) const;
-
- private:
-  std::unique_ptr<SSAGraphBuilder> builder_;
+  bool IsValidGraph(const ir::Graph* graph) const;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 958086033607a4ed8fb840f5b14fe5779625bd82..96fffb7d9430cd00b3823ada9fbe9a65a6bd718c 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -18,8 +18,8 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
@@ -32,7 +32,9 @@ class SSAGraphExecutor {
 
   virtual ~SSAGraphExecutor();
 
-  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
+  virtual const ir::Graph& Graph() const = 0;
+
+  virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
index 22a40ca4b25cdd8ed9856b6c71bffc79561edcac..ec3f31ab8d135efd2c77018e90cec46b25ca5e66 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -14,15 +14,15 @@
 
 #include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include <string>
-#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 template <typename Callback>
-static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
+static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
+  for (auto &each : graph.Get<GraphVars>(kGraphVars)) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
         callback(*pair2);
@@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
     }
   }
 
-  for (auto &var : graph.dep_vars_) {
+  for (auto &var : graph.Get<GraphDepVars>(kGraphDepVars)) {
     callback(*var);
   }
 }
 
-void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
                                     std::ostream &sout) const {
   size_t var_id = 0;
   std::unordered_map<const VarHandleBase *, size_t> vars;
@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
   });
 
   size_t op_id = 0;
-  for (auto &op : graph.ops_) {
+  for (auto &op : graph.Get<GraphOps>(kGraphOps)) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
@@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_device_print_pass,
+              paddle::framework::details::SSAGraghBuilderWithPrinter);
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index 09b0333ef2cb43a306133aa5af98d37c11454d4d..5eafd1805c3102dbd3cdfa68ee1495631c182b51 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -14,57 +14,37 @@
 
 #pragma once
 
+#include <fstream>
 #include <iosfwd>
+#include <ostream>
 #include <string>
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;
+
 class SSAGraphPrinter {
  public:
   virtual ~SSAGraphPrinter() {}
-  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+  virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0;
 };
 
 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
  public:
-  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+  void Print(const ir::Graph& graph, std::ostream& sout) const override;
 };
 
 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
- public:
-  SSAGraghBuilderWithPrinter(std::ostream& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ref_(sout) {}
-
-  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ptr_(std::move(sout)),
-        stream_ref_(*stream_ptr_) {}
-
-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
-    printer_->Print(*graph, stream_ref_);
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
+    PADDLE_ENFORCE(fout->good());
+    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
     return graph;
   }
-
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
-  }
-
- private:
-  std::unique_ptr<SSAGraphPrinter> printer_;
-  std::unique_ptr<SSAGraphBuilder> builder_;
-  std::unique_ptr<std::ostream> stream_ptr_;
-  std::ostream& stream_ref_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 99b10254a7961bf7b27b256acaece573a71c4115..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,13 +14,16 @@
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph)
+    std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
@@ -32,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -43,18 +48,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   std::unordered_set<OpHandleBase *> delayed_ops;
 
   // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->vars_) {
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
       }
     }
   }
-  for (auto &var : graph_->dep_vars_) {
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
     InsertPendingVar(&pending_vars, &ready_vars, var.get());
   }
 
-  for (auto &op : graph_->ops_) {
+  for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
@@ -64,11 +69,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   // Step 2. Insert FetchOps
   std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::vector<std::unique_ptr<ir::Node>> tmp_nodes;
   std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
   FeedFetchList fetch_data(fetch_tensors.size());
 
-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, &ready_vars, &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies,
+                 &pending_ops, &pending_vars, &ready_vars, &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
@@ -78,6 +84,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     set.clear();
   };
 
+  // Clean run context
+  run_op_futures_.clear();
+  exception_holder_.Clear();
+  event.reset(nullptr);
+
   // Step 3. Execution
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
@@ -96,20 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      if (exception_) {
-        std::exception *exp = exception_.get();
-        if (dynamic_cast<platform::EOFException *>(exp)) {
-          auto e = *static_cast<platform::EOFException *>(exp);
-          exception_.reset();
-          throw e;
-        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
-          auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          exception_.reset();
-          throw e;
-        } else {
-          LOG(FATAL) << "Unknown exception.";
+      if (exception_holder_.ExceptionCatched()) {
+        for (auto &run_op_future : run_op_futures_) {
+          run_op_future.wait();
         }
+        exception_holder_.Throw();
       } else {
         continue;
       }
@@ -118,7 +120,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     // Find the ready_ops after the ready_var.
     for (auto ready_var : cur_ready_vars) {
       pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
@@ -144,6 +146,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 void ThreadedSSAGraphExecutor::InsertFetchOps(
     const std::vector<std::string> &fetch_tensors,
     std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+    std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
     std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
     std::unordered_map<OpHandleBase *, size_t> *pending_ops,
     std::unordered_set<VarHandleBase *> *pending_vars,
@@ -151,7 +154,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
 
   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->vars_) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
@@ -161,8 +164,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
 
   for (size_t i = 0; i < fetch_tensors.size(); ++i) {
     auto &var_name = fetch_tensors[i];
-    auto &vars = fetched_vars.at(var_name);
-    auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i,
+                                 &local_scopes_);
     fetch_ops->emplace_back(op);
 
     for (auto &p : places_) {
@@ -173,7 +184,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
       op->AddInput(var);
     }
 
-    auto *fetch_dummy = new DummyVarHandle();
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get());
     op->AddOutput(fetch_dummy);
     fetch_dependencies->emplace(fetch_dummy);
     this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
@@ -191,7 +203,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
     std::unordered_set<VarHandleBase *> *pending_vars,
     BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
   pending_vars->insert(var);
-  if (var->generated_op_ == nullptr) {
+  if (var->GeneratedOp() == nullptr) {
     ready_vars->Push(var);
   }
 }
@@ -209,20 +221,15 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EOFException ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      // EOFException will not cover up existing EnforceNotMet.
-      if (exception_.get() == nullptr) {
-        exception_.reset(new platform::EOFException(ex));
-      }
+      exception_holder_.Catch(ex);
     } catch (platform::EnforceNotMet ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      exception_.reset(new platform::EnforceNotMet(ex));
+      exception_holder_.Catch(ex);
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
     }
   };
   if (pool_) {
-    pool_->enqueue(op_run);
+    run_op_futures_.emplace_back(pool_->enqueue(op_run));
   } else {
     op_run();
   }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index c69e0487e2e503a0d445300aa2fd6bb9c30b06c9..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <deque>
+#include <list>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -23,9 +24,11 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
@@ -38,8 +41,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph);
+                           std::unique_ptr<ir::Graph> &&graph);
 
+  const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
@@ -51,13 +55,12 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
-  std::unique_ptr<SSAGraph> graph_;
+  std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  std::mutex exception_mu_;
-  std::unique_ptr<std::exception> exception_;
+  ExceptionHolder exception_holder_;
   std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
@@ -70,6 +73,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   void InsertFetchOps(
       const std::vector<std::string> &fetch_tensors,
       std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+      std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
       std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
       std::unordered_map<OpHandleBase *, size_t> *pending_ops,
       std::unordered_set<VarHandleBase *> *pending_vars,
@@ -77,6 +81,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 
  private:
   ExecutionStrategy strategy_;
+  // use std::list because clear(), push_back, and for_each are O(1)
+  std::list<std::future<void>> run_op_futures_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 6f00abd9473a84a77ed1a39015e2ae079e00be79..5457870e9ff5d7cf67c9c7076b9aae94eeada779 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const {
   return ss.str();
 }
 
-std::string DummyVarHandle::DebugString() const { return "dummy"; }
+std::string DummyVarHandle::DebugString() const { return node_->Name(); }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index cae9af7217660fb7e4b8535ee8e022fb3a127668..d8c2bc40b9458a1d5a7dd8a32277d04f69295f09 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -13,11 +13,14 @@
 // limitations under the License.
 
 #pragma once
+
+#include <algorithm>
 #include <sstream>
 #include <string>
 #include <unordered_set>
 #include <utility>
 
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -25,19 +28,60 @@ namespace framework {
 namespace details {
 class OpHandleBase;
 
+// Wraps ir::Node and provide helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
+//
 // VarHandleBase is the var node in the dependency graph.
 // A variable can only be generated by a single operator. i.e.
 // This is a single assignment graph.
 struct VarHandleBase {
+  explicit VarHandleBase(ir::Node* node) : node_(node) {}
+
   virtual ~VarHandleBase();
+
   virtual std::string DebugString() const = 0;
 
+  void AddInput(OpHandleBase* in, ir::Node* node) {
+    node_->inputs.clear();
+    node_->inputs.push_back(node);
+    generated_op_ = in;
+  }
+
+  void AddOutput(OpHandleBase* out, ir::Node* node) {
+    if (pending_ops_.find(out) == pending_ops_.end()) {
+      pending_ops_.insert(out);
+      node_->outputs.push_back(node);
+    }
+  }
+
+  void RemoveOutput(OpHandleBase* out, ir::Node* node) {
+    pending_ops_.erase(out);
+    node_->outputs.erase(
+        std::remove(node_->outputs.begin(), node_->outputs.end(), node),
+        node_->outputs.end());
+  }
+
+  void ClearGeneratedOp() {
+    generated_op_ = nullptr;
+    node_->inputs.clear();
+  }
+
+  OpHandleBase* GeneratedOp() { return generated_op_; }
+
+  const std::unordered_set<OpHandleBase*>& PendingOps() const {
+    return pending_ops_;
+  }
+
+  ir::Node* Node() { return node_; }
+
+ protected:
   // The operator who generate this variable. nullptr if the variable
   // is a root node.
   OpHandleBase* generated_op_{nullptr};
 
   // Operators which depend on this variable ready.
   std::unordered_set<OpHandleBase*> pending_ops_;
+  ir::Node* node_;
 };
 
 // VarHandle is actually a single version of Runtime Variable.
@@ -46,11 +90,14 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
+  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
+
   std::string DebugString() const override;
 
-  VarHandle(size_t version, size_t scope_index, std::string name,
-            platform::Place place)
-      : version_(version),
+  VarHandle(ir::Node* node, size_t version, size_t scope_index,
+            std::string name, platform::Place place)
+      : VarHandleBase(node),
+        version_(version),
         scope_idx_(scope_index),
         name_(std::move(name)),
         place_(std::move(place)) {}
@@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase {
 
 // Dummy Variable. It is used to represent dependencies between operators
 struct DummyVarHandle : public VarHandleBase {
+  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
+
   std::string DebugString() const override;
 };
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 84f67fafa19ac545ebb7a1019059e3c74c363c56..dad170ed78c64202b5c812bd8682887fe3b736d6 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
+void Executor::Close() {
 #ifdef PADDLE_WITH_DISTRIBUTE
-void Executor::BeginPass() {
   ::paddle::operators::distributed::RPCClient::GetInstance<
       ::paddle::operators::distributed::GRPCClient>()
-      ->SendBeginPass();
-}
-
-void Executor::EndPass() {
-  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>()
-      ->SendEndPass();
-}
+      ->SendComplete();
 #endif
+}
 
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
@@ -336,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   for (auto& op : ctx->ops_) {
-    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
-    // NOTE! Please do not delete this line, it's usefull because the debug
-    // string before and after op.run are different, after run the output
-    // will have right shape which is usefull for debug.
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 563a4b2bb65dad481a755f67c7f23939816ce8e8..214ca3dc492c31d4c683790a6ae051be467401c9 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,17 +44,11 @@ class Executor {
 
   explicit Executor(const platform::Place& place);
 
-#ifdef PADDLE_WITH_DISTRIBUTE
   /*
-   * Sending signal to pserver to mark current pass started.
+   * Close this Executor.
+   * Calling this method will send complete messages to all pserver instances.
    */
-  void BeginPass();
-
-  /*
-   * Sending signal to pserver to mark current pass finished.
-   */
-  void EndPass();
-#endif
+  void Close();
 
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bf7d76a8a6e173e648cea5aaba9b7202d787173b
--- /dev/null
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -0,0 +1,9 @@
+cc_library(node SRCS node.cc DEPS proto_desc)
+cc_library(graph SRCS graph.cc DEPS node)
+cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
+cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
+cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
+
+cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
+cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
+cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f870fb2b9cf805aba84d6f4573b0574ff361e71c
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -0,0 +1,237 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::vector<std::string> FindDistTrainSendVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->InputArgumentNames();
+    send_vars.reserve(send_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return send_vars;
+}
+
+std::vector<std::string> FindDistTrainRecvVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> recv_vars;
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->OutputArgumentNames();
+    recv_vars.reserve(recv_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return recv_vars;
+}
+
+bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
+                   const std::vector<std::string> &recv_vars) {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
+    return false;
+  }
+
+  /**
+   * Check any of opvars contains `.block` and in sendvars
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &rpc_vars) -> bool {
+    for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
+      if (var.find(".block") != std::string::npos &&
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
+}
+
+Graph::Graph(const ProgramDesc &program) : program_(program) {
+  VLOG(3) << "block in program:" << program_.Size();
+  std::unordered_map<std::string, VarDesc *> all_vars;
+  for (auto *var : program.Block(0).AllVars()) {
+    all_vars.emplace(var->Name(), var);
+  }
+
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
+  for (auto *op : program.Block(0).AllOps()) {
+    ir::Node *node = CreateOpNode(op);
+    // For input args, reuse the same var name if it was created before.
+    // Otherwise, create a new one.
+    for (auto &each_var_name : op->InputArgumentNames()) {
+      ir::Node *var = nullptr;
+      if (var_nodes.find(each_var_name) != var_nodes.end()) {
+        var = var_nodes.at(each_var_name).back();
+      } else if (all_vars.count(each_var_name) != 0) {
+        var = CreateVarNode(all_vars.at(each_var_name));
+        var_nodes[each_var_name].push_back(var);
+      } else {
+        // Operation input var can be optional (dispensable). Which means
+        // the operation doesn't really need the var at runtime. In this
+        // case, the no-existed var is ready at the beginning.
+        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
+        var_nodes[each_var_name].push_back(var);
+      }
+      node->inputs.push_back(var);
+      var->outputs.push_back(node);
+    }
+    // For output args, always create a new var.
+    for (auto &each_var_name : op->OutputArgumentNames()) {
+      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      var_nodes[each_var_name].push_back(var);
+      node->outputs.push_back(var);
+      var->inputs.push_back(node);
+    }
+  }
+
+  std::vector<ir::Node *> send_ops;
+  ir::Node *send_bar = nullptr;
+  std::vector<ir::Node *> recv_ops;
+  ir::Node *fetch_bar = nullptr;
+  for (ir::Node *node : Nodes()) {
+    if (node->Name() == "send") {
+      send_ops.push_back(node);
+    } else if (node->Name() == "send_barrier") {
+      PADDLE_ENFORCE(!send_bar, "only has one send barrier");
+      send_bar = node;
+    } else if (node->Name() == "recv") {
+      recv_ops.push_back(node);
+    } else if (node->Name() == "fetch_barrier") {
+      PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
+      fetch_bar = node;
+    }
+  }
+  if (send_bar) {
+    for (ir::Node *send : send_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      send->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send);
+      send_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(send_bar);
+    }
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(recv);
+      send_bar->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send_bar);
+    }
+  }
+  if (fetch_bar) {
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(recv);
+      fetch_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(fetch_bar);
+    }
+  }
+
+  std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
+  std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
+  for (ir::Node *node : Nodes()) {
+    if (IsDistTrainOp(node, send_vars, recv_vars)) {
+      if (fetch_bar && node->Name() == "concat") {
+        ir::Node *dep_var = CreateControlDepVar();
+        fetch_bar->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(fetch_bar);
+        node->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(node);
+      }
+    }
+  }
+
+  /**
+   * We only handle write after read(WAR), since it should not have a write
+   * after write in program. If there are write after write operators, we need
+   * prune them.
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
+
+  for (auto &var : var_nodes) {
+    auto &versions = var.second;
+    if (versions.size() <= 1) continue;
+
+    auto it_new = versions.rbegin();
+    auto it_old = versions.rbegin();
+    ++it_old;
+    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      ir::Node *write_op =
+          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
+      const auto &read_ops = (*it_old)->outputs;
+
+      for (auto *read_op : read_ops) {
+        // Manually add a dependency var from read_op to write_op;
+        if (read_op == write_op) {
+          // Read Write is the same op.
+          continue;
+        }
+        // 2 ops might have been connected via other vars.
+        bool has_dep = false;
+        for (ir::Node *r_out : read_op->outputs) {
+          for (ir::Node *w_in : write_op->inputs) {
+            if (r_out == w_in) {
+              has_dep = true;
+              break;
+            }
+          }
+        }
+        if (has_dep) continue;
+
+        ir::Node *dep_var = CreateControlDepVar();
+        read_op->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(read_op);
+        write_op->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(write_op);
+      }
+    }
+  }
+}
+
+bool IsControlDepVar(const ir::Node &var) {
+  return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9d55fbf525a1a476ac469e8e57462169a7db2da
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph.h
@@ -0,0 +1,131 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  virtual ~Graph() {
+    for (auto &attr : attrs_) {
+      attr_dels_[attr.first]();
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
+
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
+                   attr_name);
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+  }
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+                   attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc) {
+    return AddNode(new ir::Node(var_desc));
+  }
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc) {
+    return AddNode(new ir::Node(op_desc));
+  }
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar() {
+    // TODO(panyx0718): control var name should be really unique.
+    const std::string name = string::Sprintf(
+        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+    return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+  }
+
+  // A more free style way of creating a graph node. Mostly use for test
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
+    return AddNode(new ir::Node(name, type));
+  }
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes() {
+    std::vector<std::unique_ptr<ir::Node>> ret;
+    for (auto &n : nodes_) {
+      ret.emplace_back(n.second.release());
+    }
+    nodes_.clear();
+    node_set_.clear();
+    return ret;
+  }
+
+ private:
+  // This method takes ownership of `node`.
+  ir::Node *AddNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
+    nodes_[node].reset(node);
+    node_set_.insert(node);
+    return node;
+  }
+
+  void RemoveNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+    node_set_.erase(node);
+    nodes_.erase(node);
+  }
+
+  // NOTE: program_ shouldn't be exposed to user.
+  const ProgramDesc &program_;
+  std::map<std::string, boost::any> attrs_;
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
+  std::unordered_set<ir::Node *> node_set_;
+};
+
+bool IsControlDepVar(const ir::Node &var);
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1c19e6535150130822e9f48685241e62de5b064
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+void SortHelper(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    ir::Node *node, std::unordered_set<ir::Node *> *visited,
+    std::vector<ir::Node *> *ret) {
+  visited->insert(node);
+
+  for (auto adj : adj_list.at(node)) {
+    if (visited->find(adj) == visited->end()) {
+      SortHelper(adj_list, adj, visited, ret);
+    }
+  }
+
+  VLOG(3) << "topology sort insert: " << node->Name()
+          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  ret->push_back(node);
+}
+
+bool HasCircleHelper(
+    ir::Node *node,
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    std::unordered_set<ir::Node *> *visited,
+    std::unordered_set<ir::Node *> *in_trace) {
+  if (visited->find(node) == visited->end()) {
+    visited->insert(node);
+    in_trace->insert(node);
+
+    for (ir::Node *in : adj_list.at(node)) {
+      if (visited->find(in) == visited->end() &&
+          HasCircleHelper(in, adj_list, visited, in_trace)) {
+        return true;
+      } else if (in_trace->find(in) != in_trace->end()) {
+        return true;
+      }
+    }
+  }
+  in_trace->erase(node);
+  return false;
+}
+
+bool HasCircleInternal(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::unordered_set<ir::Node *> in_trace;
+  for (auto &adj : adj_list) {
+    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+bool HasCircle(const Graph &graph) {
+  return HasCircleInternal(BuildOperationAdjList(graph));
+}
+
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
+      BuildOperationAdjList(graph);
+  PADDLE_ENFORCE(!HasCircleInternal(adj_list));
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+  return ret;
+}
+
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : graph.Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        adj_list[n].insert(adj_n);
+        VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                << "  via " << var->Name() << reinterpret_cast<void *>(var);
+      }
+    }
+  }
+  return adj_list;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd6c53a07f8f56781989739d995226bd02b3d3d0
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+// Test if the graph contains circle.
+bool HasCircle(const Graph &graph);
+
+// Topology Sort the operations in the graph from inputs to outputs.
+// `graph` cannot contain circle.
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
+
+// Build an adjacency list of operations for the `graph`.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph);
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a260dd3da2a7863c06e51aa4feafd824ea254139
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o1->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o1);
+}
+
+void BuildCircleGraph2(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+void BuildNoCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+
+TEST(GraphHelperTest, Basic) {
+  ProgramDesc prog;
+
+  Graph g(prog);
+  BuildCircleGraph(&g);
+  ASSERT_TRUE(HasCircle(g));
+
+  Graph g2(prog);
+  BuildCircleGraph2(&g2);
+  ASSERT_TRUE(HasCircle(g2));
+
+  auto adj_list = BuildOperationAdjList(g2);
+  for (auto& adj : adj_list) {
+    auto& adj_set = adj.second;
+    if (adj.first->Name() == "op1") {
+      ASSERT_EQ((*adj_set.begin())->Name(), "op2");
+    } else if (adj.first->Name() == "op2") {
+      ASSERT_EQ((*adj_set.begin())->Name(), "op1");
+    } else {
+      ASSERT_TRUE(false);
+    }
+  }
+
+  Graph g3(prog);
+  BuildNoCircleGraph(&g3);
+  ASSERT_FALSE(HasCircle(g3));
+  auto sorted = TopologySortOperations(g3);
+  std::map<std::string, size_t> node_map;
+  for (size_t i = 0; i < sorted.size(); ++i) {
+    node_map[sorted[i]->Name()] = i;
+  }
+  ASSERT_EQ(node_map.at("op1"), 0UL);
+  ASSERT_EQ(node_map.at("op2"), 1UL);
+  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9e6bdf3625bdced9d1a9195a979b0f46016d8bf
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class NOP : public OperatorBase {
+ public:
+  NOP(const std::string &type, const VariableNameMap &inputs,
+      const VariableNameMap &outputs, const AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const Scope &scope,
+               const platform::Place &place) const override {}
+};
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class SumOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+    auto &inputs = op_desc.Input("X");
+    auto default_var_type = proto::VarType::SELECTED_ROWS;
+
+    bool any_input_is_lod_tensor = std::any_of(
+        inputs.begin(), inputs.end(), [block](const std::string &name) {
+          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        });
+    if (any_input_is_lod_tensor) {
+      default_var_type = proto::VarType::LOD_TENSOR;
+    }
+
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(default_var_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
+                  paddle::framework::SumOpMaker);
+
+namespace paddle {
+namespace framework {
+
+TEST(GraphTest, Basic) {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"test_a", "test_b", "test_c"});
+  op->SetOutput("Out", {"test_out"});
+  op->SetAttr("op_role", 1);
+
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_out");
+
+  op->InferVarType(prog.MutableBlock(0));
+
+  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
+  op->InferVarType(prog.MutableBlock(0));
+  ASSERT_EQ(proto::VarType::LOD_TENSOR,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
+  for (ir::Node *n : nodes) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->inputs.size(), 3UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
+    } else if (n->Name() == "test_a" || n->Name() == "test_b" ||
+               n->Name() == "test_c") {
+      ASSERT_EQ(n->inputs.size(), 0UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
+    } else if (n->Name() == "test_out") {
+      ASSERT_EQ(n->inputs.size(), 1UL);
+      ASSERT_EQ(n->outputs.size(), 0UL);
+    }
+  }
+  ASSERT_EQ(nodes.size(), 5);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cb812d1388bf74d173a4dc7a99561e730f8e95a
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+static const char kGraphVizPath[] = "graph_viz_path";
+
+std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
+  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
+  PADDLE_ENFORCE(fout->good());
+  std::ostream& sout = *fout;
+
+  size_t var_id = 0;
+  std::unordered_map<const ir::Node*, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kVariable) continue;
+    size_t cur_var_id = var_id++;
+    vars[n] = cur_var_id;
+
+    sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]"
+         << std::endl;
+  }
+
+  size_t op_id = 0;
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : n->inputs) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : n->outputs) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
+    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fd8c8a26e9581ccf605d4271a49ec2e90d8b997
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class GraphVizPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aca77da8d674f29b89c023717cdcd061232d023a
--- /dev/null
+++ b/paddle/fluid/framework/ir/node.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+const char Node::kControlDepVarName[] = "__control_var";
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3138fccee86fb274abe72007961fc1c982b1e96
--- /dev/null
+++ b/paddle/fluid/framework/ir/node.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Node {
+ public:
+  enum class Type { kOperation, kVariable };
+  static const char kControlDepVarName[];
+
+  explicit Node(const std::string& name, Type type)
+      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
+
+  explicit Node(VarDesc* var_desc)
+      : name_(var_desc->Name()),
+        var_desc_(var_desc),
+        op_desc_(nullptr),
+        type_(Type::kVariable) {}
+
+  explicit Node(OpDesc* op_desc)
+      : name_(op_desc->Type()),
+        var_desc_(nullptr),
+        op_desc_(op_desc),
+        type_(Type::kOperation) {}
+
+  Type NodeType() const { return type_; }
+
+  std::string Name() const { return name_; }
+
+  VarDesc* Var() {
+    PADDLE_ENFORCE(type_ == Type::kVariable);
+    return var_desc_;
+  }
+
+  OpDesc* Op() {
+    PADDLE_ENFORCE(type_ == Type::kOperation);
+    return op_desc_;
+  }
+
+  std::vector<Node*> inputs;
+  std::vector<Node*> outputs;
+
+ protected:
+  const std::string name_;
+  VarDesc* var_desc_;
+  OpDesc* op_desc_;
+  Type type_;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Node);
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7158eba62686be57499df697466797e4034ea8f
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
+  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
+  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
+  for (const std::string& attr : required_pass_attrs_) {
+    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
+                   "Required pass atrribute %s not set.", attr);
+  }
+  for (const std::string& attr : required_graph_attrs_) {
+    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
+                   attr);
+  }
+  auto applied_graph = ApplyImpl(std::move(graph));
+  // TODO(panyx0718): Add more verifications.
+  PADDLE_ENFORCE(!HasCircle(*applied_graph),
+                 "Illegal Pass. Generated graph shouldn't has cycle.");
+  applied_ = true;
+  return applied_graph;
+}
+
+PassRegistry& PassRegistry::Instance() {
+  static PassRegistry g_pass_info_map;
+  return g_pass_info_map;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f14083d259172f5b5f1ed80c7d38312d711beb5
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass.h
@@ -0,0 +1,200 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <string>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+template <typename PassType>
+struct PassRegistrar;
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {
+    for (auto &attr : attrs_) {
+      if (attr_dels_.find(attr.first) != attr_dels_.end()) {
+        attr_dels_[attr.first]();
+      }
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
+
+  // Get a reference to the attributed previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
+                   "%s attr not registered for pass.", attr_name);
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+  }
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    attrs_[attr_name] = attr;
+  }
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(
+      std::unique_ptr<Graph> graph) const = 0;
+
+ private:
+  template <typename PassType>
+  friend struct PassRegistrar;
+
+  void RegisterRequiredPassAttrs(const std::unordered_set<std::string> &attrs) {
+    required_pass_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  void RegisterRequiredGraphAttrs(
+      const std::unordered_set<std::string> &attrs) {
+    required_graph_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  mutable bool applied_{false};
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
+  std::map<std::string, boost::any> attrs_;
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+using PassCreator = std::function<std::unique_ptr<Pass>()>;
+
+class Registrar {
+ public:
+  // In our design, various kinds of passes,
+  // have their corresponding registry and registrar. The action of
+  // registration is in the constructor of a global registrar variable, which
+  // are not used in the code that calls package framework, and would
+  // be removed from the generated binary file by the linker. To avoid such
+  // removal, we add Touch to all registrar classes and make USE_PASS macros to
+  // call this method. So, as long as the callee code calls USE_PASS, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};
+
+class PassRegistry {
+ public:
+  static PassRegistry &Instance();
+
+  bool Has(const std::string &pass_type) const {
+    return map_.find(pass_type) != map_.end();
+  }
+
+  void Insert(const std::string &pass_type, const PassCreator &pass_creator) {
+    PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type);
+    map_.insert({pass_type, pass_creator});
+  }
+
+  std::unique_ptr<Pass> Get(const std::string &pass_type) const {
+    PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered",
+                   pass_type);
+    return map_.at(pass_type)();
+  }
+
+ private:
+  PassRegistry() = default;
+  std::unordered_map<std::string, PassCreator> map_;
+
+  DISABLE_COPY_AND_ASSIGN(PassRegistry);
+};
+
+template <typename PassType>
+struct PassRegistrar : public Registrar {
+  explicit PassRegistrar(const char *pass_type) {
+    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
+                   "'%s' is registered more than once.", pass_type);
+    PassRegistry::Instance().Insert(
+        pass_type, [this]() -> std::unique_ptr<Pass> {
+          std::unique_ptr<Pass> pass(new PassType());
+          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
+          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          return pass;
+        });
+  }
+
+  PassRegistrar<PassType> &RequirePassAttr(const std::string &attr) {
+    required_pass_attrs_.insert(attr);
+    return *this;
+  }
+
+  PassRegistrar<PassType> &RequireGraphAttr(const std::string &attr) {
+    required_graph_attrs_.insert(attr);
+    return *this;
+  }
+
+ private:
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
+};
+
+#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+// Register a new pass that can be applied on the IR.
+#define REGISTER_PASS(pass_type, pass_class)                          \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __reg_pass__##pass_type,                                        \
+      "REGISTER_PASS must be called in global namespace");            \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      __pass_registrar_##pass_type##__(#pass_type);                   \
+  int TouchPassRegistrar_##pass_type() {                              \
+    __pass_registrar_##pass_type##__.Touch();                         \
+    return 0;                                                         \
+  }                                                                   \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \
+          __pass_registrar_##pass_type##__
+
+#define USE_PASS(pass_type)                                           \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __use_pass_itself_##pass_type,                                  \
+      "USE_PASS must be called in global namespace");                 \
+  extern int TouchPassRegistrar_##pass_type();                        \
+  static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \
+      TouchPassRegistrar_##pass_type()
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b5011412ed39e033a7a65921e9c64ce2d54c638
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set<int>("copy_test_pass_attr", new int);
+    graph->Set<int>("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f..919029c38f2f26a6f5e02da645c4f7718044cdae 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -312,19 +312,22 @@ void WriteToRecordIO(recordio::Writer *writer,
   writer->Write(buffer.str());
 }
 
-std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
-  std::vector<LoDTensor> result;
-  if (scanner->HasNext()) {
-    std::istringstream sin(scanner->Next());
-    uint32_t sz;
-    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
-    result.resize(sz);
-    for (uint32_t i = 0; i < sz; ++i) {
-      DeserializeFromStream(sin, &result[i], dev_ctx);
-    }
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  if (!scanner->HasNext()) {
+    return false;
   }
-  return result;
+  std::istringstream sin(scanner->Next());
+  uint32_t sz;
+  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+  auto &result = *result_ptr;
+  result.resize(sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    DeserializeFromStream(sin, &result[i], dev_ctx);
+  }
+
+  return true;
 }
 
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 4a2729373b5c63176ed1e856f4acf29fd1e73254..e9b473d547252e80ed26ec61e1a33fbe1742dbe0 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);
 
-extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
+extern bool ReadFromRecordIO(recordio::Scanner* scanner,
+                             const platform::DeviceContext& dev_ctx,
+                             std::vector<LoDTensor>* result_ptr);
 
 /*
  * Convert between length-based LoD and offset-based LoD.
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 38d3cd96d65f0a54b0ea87b4c677013f3802adfb..cd50aaa26054b78f1b1e8f0d470b397892155a2b 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -301,11 +301,12 @@ static void TestRecordIO() {
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(&scanner, ctx);
+    std::vector<framework::LoDTensor> tensors;
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
     ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
     ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 29b3396bc9854cd3d3ac8d4283f48019c9a9c55f..7836ecb1272a07a79a70c9cb040335f9a42e5684 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -16,6 +16,7 @@
 
 #include <algorithm>
 #include <initializer_list>
+#include <memory>
 #include <vector>
 
 #include "paddle/fluid/framework/tensor.h"
@@ -26,6 +27,7 @@
 namespace paddle {
 namespace framework {
 
+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +39,11 @@ class Vector {
   Vector() { InitEmpty(); }
 
   // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
     InitEmpty();
     if (count != 0) {
       resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
       for (size_t i = 0; i < count; ++i) {
         ptr[i] = value;
       }
@@ -59,7 +61,7 @@ class Vector {
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
     if (dat.size() == 0) {
       InitEmpty();
     } else {
@@ -68,10 +70,10 @@ class Vector {
   }
 
   // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }
 
   // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
     if (other.size() != 0) {
       this->InitByIter(other.size(), other.begin(), other.end());
     } else {
@@ -81,7 +83,7 @@ class Vector {
   }
 
   // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
     this->size_ = other.size_;
     this->flag_ = other.flag_;
     if (other.cuda_vec_.memory_size()) {
@@ -93,13 +95,13 @@ class Vector {
   }
 
   // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
     MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
   }
 
   // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
     ImmutableCPU();
     return cpu_vec_.data<T>()[i];
   }
@@ -107,43 +109,43 @@ class Vector {
   // std::vector iterator methods. Based on CPU data access method
   size_t size() const { return size_; }
 
-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  T* end() {
+  T *end() {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  T& front() { return *begin(); }
+  T &front() { return *begin(); }
 
-  T& back() {
+  T &back() {
     auto it = end();
     --it;
     return *it;
   }
 
-  const T* begin() const {
+  const T *begin() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
   }
 
-  const T* end() const {
+  const T *end() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }
 
-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }
 
-  const T& back() const {
+  const T &back() const {
     auto it = end();
     --it;
     return *it;
   }
 
-  T* data() { return begin(); }
+  T *data() { return begin(); }
 
-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }
 
-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
@@ -169,7 +171,7 @@ class Vector {
   void Extend(It begin, It end) {
     size_t pre_size = size_;
     resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
     for (; begin < end; ++begin, ++ptr) {
       *ptr = *begin;
     }
@@ -183,9 +185,9 @@ class Vector {
       MutableCPU();
       Tensor cpu_tensor;
       platform::Place cpu = platform::CPUPlace();
-      T* ptr = cpu_tensor.mutable_data<T>(
+      T *ptr = cpu_tensor.mutable_data<T>(
           framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T* old_ptr =
+      const T *old_ptr =
           cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
       if (old_ptr != nullptr) {
         std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,7 +198,7 @@ class Vector {
   }
 
   // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
     PADDLE_ENFORCE(platform::is_gpu_place(place),
                    "CUDA Data must on CUDA place");
     ImmutableCUDA(place);
@@ -204,10 +206,10 @@ class Vector {
   }
 
   // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
     flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
   }
 
   // clear
@@ -228,7 +230,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
     if (platform::is_gpu_place(place)) {
       return CUDAData(place);
     } else {
@@ -237,7 +239,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
     if (platform::is_gpu_place(place)) {
       return CUDAMutableData(place);
     } else {
@@ -253,7 +255,7 @@ class Vector {
     return result;
   }
 
-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
     auto it1 = cbegin();
     auto it2 = other.cbegin();
@@ -274,7 +276,7 @@ class Vector {
   template <typename Iter>
   void InitByIter(size_t size, Iter begin, Iter end) {
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
     for (size_t i = 0; i < size; ++i) {
       *ptr++ = *begin++;
@@ -368,7 +370,7 @@ class Vector {
     }
   }
 
-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
     static T dummy = T();
     return dummy;
   }
@@ -379,5 +381,52 @@ class Vector {
   size_t size_;
 };
 
-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())  // NOLINT
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}  // NOLINT
+  CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other)  // NOLINT
+      : std::vector<T>(std::move(other)) {}
+  CPUVector &operator=(const CPUVector &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+  CPUVector &operator=(const std::vector<T> &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
+    std::stringstream ss;
+    for (auto v : other) {
+      os << v << " ";
+    }
+    return os;
+  }
+
+  T &operator[](size_t id) { return this->at(id); }
+
+  const T &operator[](size_t id) const { return this->at(id); }
+
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->reserve(this->size() + size_t(end - begin));
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;
+
+#endif  // PADDLE_WITH_CUDA
+
+};  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0599c8d384641606b0a5ebb5ba1781b56f539e63
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector_test.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <memory>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+
+template <typename T>
+using vec = paddle::framework::Vector<T>;
+
+TEST(mixed_vector, CPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10UL);
+  vec<int> tmp2;
+  tmp2 = tmp;
+  ASSERT_EQ(tmp2.size(), 10UL);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp2[i], i);
+    ASSERT_EQ(tmp2[i], tmp[i]);
+  }
+  int cnt = 0;
+  for (auto& t : tmp2) {
+    ASSERT_EQ(t, cnt);
+    ++cnt;
+  }
+}
+
+TEST(mixed_vector, InitWithCount) {
+  paddle::framework::Vector<int> vec(10, 10);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(vec[i], 10);
+  }
+}
+
+TEST(mixed_vector, ForEach) {
+  vec<int> tmp;
+  for (auto& v : tmp) {
+    VLOG(3) << v;
+  }
+}
+
+TEST(mixed_vector, Reserve) {
+  paddle::framework::Vector<int> vec;
+  vec.reserve(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
+
+TEST(mixed_vector, Resize) {
+  paddle::framework::Vector<int> vec;
+  vec.resize(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index d57f82510833d6a0cea7009cf1f0b49543812f8d..4b0caa8d350dde0462e5fdcca743df919358a364 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -11,7 +11,9 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
+
 #include <cuda_runtime.h>
+#include <memory>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
@@ -21,26 +23,6 @@
 template <typename T>
 using vec = paddle::framework::Vector<T>;
 
-TEST(mixed_vector, CPU_VECTOR) {
-  vec<int> tmp;
-  for (int i = 0; i < 10; ++i) {
-    tmp.push_back(i);
-  }
-  ASSERT_EQ(tmp.size(), 10UL);
-  vec<int> tmp2;
-  tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10UL);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(tmp2[i], i);
-    ASSERT_EQ(tmp2[i], tmp[i]);
-  }
-  int cnt = 0;
-  for (auto& t : tmp2) {
-    ASSERT_EQ(t, cnt);
-    ++cnt;
-  }
-}
-
 static __global__ void multiply_10(int* ptr) {
   for (int i = 0; i < 10; ++i) {
     ptr[i] *= 10;
@@ -91,24 +73,3 @@ TEST(mixed_vector, MultiGPU) {
     ASSERT_EQ(tmp[i], i * 100);
   }
 }
-
-TEST(mixed_vector, InitWithCount) {
-  paddle::framework::Vector<int> vec(10, 10);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(vec[i], 10);
-  }
-}
-
-TEST(mixed_vector, ForEach) {
-  vec<int> tmp;
-  for (auto& v : tmp) {
-  }
-}
-
-TEST(mixed_vector, Reserve) {
-  paddle::framework::Vector<int> vec;
-  vec.reserve(1);
-  vec.push_back(0);
-  vec.push_back(0);
-  vec.push_back(0);
-}
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
index f1261dee0319440995951d1bee145404186a8ad4..af75baa5c4b98f7d092834c05eb57e9c7e131b29 100644
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -21,8 +21,8 @@ namespace framework {
 // a static local variable is already being initialized.
 // https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 OpInfoMap& OpInfoMap::Instance() {
-  static OpInfoMap* g_op_info_map = new OpInfoMap();
-  return *g_op_info_map;
+  static OpInfoMap g_op_info_map;
+  return g_op_info_map;
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
             "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
+
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
+                               LibraryType::kCUDNN);
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
+            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
+            "type[CUDNN]");
 }
 
 TEST(OpKernelType, Hash) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..2288c7fe6609a765612b468d69ad35101b92b384 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
 
+void OpProtoAndCheckerMaker::Reuse(const std::string& name,
+                                   const std::string& reused_name) {
+  bool found = false;
+  proto::OpProto::Var* var;
+
+  for (auto& var : proto_->inputs()) {
+    if (var.name() == reused_name) {
+      found = true;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+
+  found = false;
+  for (int i = 0; i < proto_->outputs().size(); ++i) {
+    var = proto_->mutable_outputs()->Mutable(i);
+    if (var->name() == name) {
+      PADDLE_ENFORCE(!var->has_reuse(),
+                     "Output(%s) has been set reused var of %s", name,
+                     var->reuse());
+      found = true;
+      var->set_reuse(reused_name);
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+}
+
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 92f86bb5de520878d0a7b8d7214620580242c061..80970291c9c234f1306162f4ffa3c2528f88c35f 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);
 
+  void Reuse(const std::string &name, const std::string &reused_name);
+
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
 }
 
 class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+  }
+};
+
+class TestInplaceProtoMaker2
+    : public paddle::framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 
 TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto, op_proto2;
   paddle::framework::OpAttrChecker op_checker;
   TestInplaceProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  TestInplaceProtoMaker2 proto_maker2;
+
+  proto_maker(&op_proto, &op_checker);
+
+  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
                paddle::platform::EnforceNotMet);
-  // proto_maker(&op_proto, &op_checker);
-  // proto_maker.Make();
-  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
+
+// normal reuse
+class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddInput("Y", "input of test op");
+    AddOutput("Out", "output of test op");
+    AddOutput("XOut", "output of test op");
+    // avoid destructor exception.
+    // Validate();
+    TestReuse();
+  }
+
+  virtual void TestReuse() {}
+};
+
+// test duplicate reuse error
+class TestReuseProtoMaker2 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "X");
+    Reuse("Out", "Y");
+  }
+};
+
+// NotExists Input
+class TestReuseProtoMaker3 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "NotExists");
+    Reuse("XOut", "X");
+  }
+};
+
+// NotExists Output
+class TestReuseProtoMaker4 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() { Reuse("NotExists", "X"); }
+};
+
+TEST(ProtoMaker, Reuse) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker proto_maker;
+  proto_maker(&op_proto, &op_checker);
+}
+
+// NOTE(dzhwinter):
+// There is a Fatal CHECK on base class destructor, which will call abort inside
+// instead of
+// throw an exception. If we throw an exception in Make(), we will trigger the
+// CHECK and terminate the tests.
+//
+// I had tried to replace the default CHECK with a exception, however, it's
+// still not supported by glog.
+// the details:
+// https://github.com/google/glog/issues/249
+// https://github.com/facebookresearch/TensorComprehensions/issues/351
+/*
+TEST(ProtoMaker, ReuseWithException) {
+  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker2 proto_maker2;
+  TestReuseProtoMaker3 proto_maker3;
+  TestReuseProtoMaker4 proto_maker4;
+  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+void FailureFunction() {
+  throw std::runtime_error("Check failed in destructor.");
+  // return 0;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::InstallFailureFunction(&FailureFunction);
+  return RUN_ALL_TESTS();
+}
+*/
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 3cf8e8696d739e3f2894e490161b9fb5b459bc41..d04f7744961b2561977f4d36d0073a97557043da 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return DDim({-1});
+    }
+    return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
       return var->Get<SelectedRows>().value().dims();
@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }
 
+static std::string GetDtype(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return "";
+  }
+
+  if (var->IsType<LoDTensor>()) {
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return "";
+    }
+    return DataTypeToString(ToDataType(tensor.type()));
+  } else if (var->IsType<SelectedRows>()) {
+    return DataTypeToString(
+        ToDataType(var->Get<SelectedRows>().value().type()));
+  } else {
+    return "";
+  }
+}
+
 static int GetRowSize(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -91,14 +116,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().lod();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return default_lod;
+    }
+    return tensor.lod();
   } else {
     return default_lod;
   }
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(10) << "- " << DebugStringEx(&scope);
+  VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -107,8 +136,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::RecordEvent record_event(Type(), pool.Get(place));
   RunImpl(scope, place);
-  VLOG(10) << "+ " << DebugStringEx(&scope);
+  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -172,6 +203,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
         if (row_size >= 0) {
           ss << "[row_size=" << row_size << "]";
         }
+        std::string dtype = GetDtype(*scope, input.second[i]);
+        ss << ":" << dtype;
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
@@ -608,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
-  // For profiling, don't move out of this function because that will result
-  // in the failure of multi-GPU profiling.
-  platform::RecordEvent record_event(Type(), dev_ctx);
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
   auto kernels_iter = all_op_kernels.find(type_);
@@ -633,6 +663,16 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_MKLDNN
+  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+  if (kernel_iter == kernels.end() &&
+      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    expected_kernel_key.library_type_ = LibraryType::kPlain;
+    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
   if (kernel_iter == kernels.end()) {
     PADDLE_THROW("op %s does not have kernel for %s", type_,
                  KernelTypeToString(expected_kernel_key));
@@ -669,6 +709,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
         CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      } else if (var->IsType<framework::SelectedRows>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
       }
     }
   }
@@ -736,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
   auto& scope = ctx.scope();
   int data_type = -1;
+  std::string last_input_name;
   for (auto& input : this->inputs_) {
     for (auto& ipt_name : input.second) {
       auto* var = scope.FindVar(ipt_name);
@@ -752,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           int tmp = static_cast<int>(ToDataType(t->type()));
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
-              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
-              data_type, tmp);
+              "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
+              Type(), last_input_name, data_type, ipt_name, tmp);
           data_type = tmp;
+          last_input_name = ipt_name;
         }
       }
     }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3..b5f01a9a2b76472063658f1a051a2ee3c65559b7 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -18,18 +18,81 @@ limitations under the License. */
 #include <tuple>
 #include <vector>
 
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 
+std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+#ifdef PADDLE_WITH_CUDA
+    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
+#else
+    const BuildStrategy &strategy) {
+#endif
+  // Convert the program to graph.
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  // Apply a graph viz pass to record a graph.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
+  }
+
+  // Convert graph to run on multi-devices.
+  auto multi_device_pass =
+      ir::PassRegistry::Instance().Get("multi_device_pass");
+  multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
+                                                                     &places);
+  multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
+                                                    &loss_var_name);
+  multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
+      "params", &param_names);
+  multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                             &local_scopes);
+  multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+  multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+  graph = multi_device_pass->Apply(std::move(graph));
+
+  // Apply a graph print pass to record a graph with device info.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto multi_device_print_pass =
+        ir::PassRegistry::Instance().Get("multi_device_print_pass");
+    multi_device_print_pass->SetNotOwned<const std::string>(
+        "debug_graphviz_path", &strategy.debug_graphviz_path_);
+    multi_device_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+        "graph_printer", new details::GraphvizSSAGraphPrinter);
+    graph = multi_device_print_pass->Apply(std::move(graph));
+  }
+
+  // Verify that the graph is correct for multi-device executor.
+  auto multi_device_check_pass =
+      ir::PassRegistry::Instance().Get("multi_device_check_pass");
+  graph = multi_device_check_pass->Apply(std::move(graph));
+  return graph;
+}
+
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -45,6 +108,7 @@ class ParallelExecutorPrivate {
 #endif
   bool own_local_scope_;
   bool use_cuda_;
+  bool use_all_reduce_;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -62,6 +126,14 @@ ParallelExecutor::ParallelExecutor(
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->use_all_reduce_ =
+      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
+
+  if (!member_->use_all_reduce_) {
+    PADDLE_ENFORCE(places.size() > 1,
+                   "If you set build_strategy.reduce with 'Reduce',"
+                   "the number of places must be greater than 1.");
+  }
 
   // Step 1. Bcast the params to devs.
   // Create local scopes
@@ -95,7 +167,7 @@ ParallelExecutor::ParallelExecutor(
   }
 
   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToGPUs(bcast_vars);
+    BCastParamsToDevices(bcast_vars);
   }
   // Startup Program has been run. All local scopes has correct parameters.
 
@@ -108,39 +180,43 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }
 
-  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
-  // ncclOp
-  details::SSAGraphBuilderFactory builder_factory(
-      member_->places_, loss_var_name, params, member_->local_scopes_,
-      build_strategy);
-  if (member_->use_cuda_) {
+// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+// ncclOp
 #ifdef PADDLE_WITH_CUDA
-    builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy,
+      member_->nccl_ctxs_.get());
 #else
-    PADDLE_THROW("Not compiled with CUDA");
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy);
 #endif
-  }
 
-  builder_ = builder_factory.Create();
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places,
-      builder_->Build(main_program)));
-
+      exec_strategy, member_->local_scopes_, places, std::move(graph)));
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, std::move(var_infos),
       member_->places_, std::move(member_->executor_)));
 }
 
-void ParallelExecutor::BCastParamsToGPUs(
+void ParallelExecutor::BCastParamsToDevices(
     const std::unordered_set<std::string> &vars) const {
-  // the the initializing bcast, all vars would be bcast from device(0),
+  // the initializing bcast, all vars would be bcast from device(0),
   // otherwise
   // bcast from the specified device.
-  bool initializing = builder_.get() == nullptr ? true : false;
-
+  bool initializing = member_->executor_ ? false : true;
   for (auto &var : vars) {
-    int var_dev_id =
-        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
+    int var_dev_id = -1;
+    if (member_->executor_) {
+      auto &sharded_var_device =
+          member_->executor_->Graph().Get<details::ShardedVarDevice>(
+              details::kShardedVarDevice);
+      if (sharded_var_device.find(var) != sharded_var_device.end()) {
+        var_dev_id = sharded_var_device.at(var);
+      }
+    }
+
     if (!initializing && var_dev_id == -1) continue;
 
     framework::Variable *main_var = nullptr;
@@ -202,12 +278,23 @@ void ParallelExecutor::BCastParamsToGPUs(
 #endif
     } else {
       platform::CPUPlace cpu;
-      for (size_t i = 1; i < member_->places_.size(); ++i) {
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        if ((initializing && i == 0) ||
+            (!initializing && static_cast<int>(i) == var_dev_id))
+          continue;
+
         auto local_scope = member_->local_scopes_[i];
         auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
-        t->Resize(dims);
-        t->mutable_data(cpu, main_tensor.type());
-        paddle::framework::TensorCopy(main_tensor, cpu, t);
+
+        // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
+        if (member_->use_all_reduce_ || member_->use_cuda_ ||
+            var == "@LR_DECAY_COUNTER@") {
+          t->Resize(dims);
+          t->mutable_data(cpu, main_tensor.type());
+          paddle::framework::TensorCopy(main_tensor, cpu, t);
+        } else {
+          t->ShareDataWith(main_tensor);
+        }
       }
     }
   }
@@ -265,3 +352,8 @@ ParallelExecutor::~ParallelExecutor() {
 
 }  // namespace framework
 }  // namespace paddle
+
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_device_pass);
+USE_PASS(multi_device_check_pass);
+USE_PASS(multi_device_print_pass);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 058f83f07c26224e3180d140630c08a24c40cd80..d624956acde86cefc4ec1dec80df3738bcf1d8be 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -66,11 +66,10 @@ class ParallelExecutor {
   void Run(const std::vector<std::string> &fetch_tensors,
            const std::string &fetched_var_name);
 
-  void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
+  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
  private:
   ParallelExecutorPrivate *member_;
-  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 0b36f1116d15004b355e854e101abb9ad3297836..40eafda9bf294f7e8ddd067e9014447f4de1cc0e 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -13,29 +13,62 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/reader.h"
+#include <deque>
 
 namespace paddle {
 namespace framework {
-ReaderBase::~ReaderBase() {}
-
-FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
 
-void FileReader::ReadNext(std::vector<LoDTensor> *out) {
+void ReaderBase::ReadNext(std::vector<LoDTensor> *out) {
+  std::lock_guard<std::mutex> lock(mu_);
+  PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning);
   ReadNextImpl(out);
-  if (out->empty()) {
-    return;
-  }
+}
 
-  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
-  for (size_t i = 0; i < dims_.size(); ++i) {
-    auto &actual = (*out)[i].dims();
-    auto &expect = dims_[i];
+void ReaderBase::InsertDecoratedReader(
+    const std::shared_ptr<ReaderBase> &decorated_reader) {
+  std::lock_guard<std::mutex> guard(mu_);
+  decorated_readers_.emplace_back(decorated_reader);
+}
 
-    PADDLE_ENFORCE_EQ(actual.size(), expect.size());
-    for (int j = 0; j < actual.size(); ++j) {
-      //      PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1);
+std::unordered_set<ReaderBase *> ReaderBase::GetEndPoints() {
+  std::unordered_set<ReaderBase *> result;
+  std::deque<ReaderBase *> queue;
+  queue.emplace_back(this);
+  while (!queue.empty()) {  // BFS search
+    auto *front = queue.front();
+    queue.pop_front();
+    if (front->decorated_readers_.empty()) {
+      result.emplace(front);
+    } else {
+      for (auto &reader : front->decorated_readers_) {
+        if (auto *reader_ptr = reader.lock().get()) {
+          queue.emplace_back(reader_ptr);
+        }
+      }
     }
   }
+
+  return result;
 }
+
+void ReaderBase::Shutdown() {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (status_ != ReaderStatus::kStopped) {
+    ShutdownImpl();
+    status_ = ReaderStatus::kStopped;
+  }
+}
+
+void ReaderBase::Start() {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (status_ != ReaderStatus::kRunning) {
+    StartImpl();
+    status_ = ReaderStatus::kRunning;
+  }
+}
+
+ReaderBase::~ReaderBase() {}
+
+DecoratedReader::~DecoratedReader() { reader_->Shutdown(); }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 64d4ceab624312ed366d7e835072899f1f033a88..82562bf883d88787858912f7039cf8fef003eccf 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/ddim.h"
@@ -26,59 +27,116 @@ namespace framework {
 
 class ReaderBase {
  public:
-  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  virtual void ReadNext(std::vector<LoDTensor>* out);
 
-  virtual void ReInit() = 0;
+  virtual void Shutdown();
+
+  virtual void Start();
+
+  // Return the readers which are the end of decorating chain. Basically
+  // they are readers just before read op.
+  std::unordered_set<ReaderBase*> GetEndPoints();
 
   virtual ~ReaderBase();
+
+ protected:
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}
+
+  virtual void ShutdownImpl() {}
+
+  virtual void StartImpl() {}
+
+  enum ReaderStatus { kRunning, kStopped };
+
+  ReaderStatus status_{kRunning};
+
+  mutable std::mutex mu_;
+
+ private:
+  friend class DecoratedReader;
+  // These methods can be only invoked inside DecoratedReader to record the
+  // decorating chain.
+  void InsertDecoratedReader(
+      const std::shared_ptr<ReaderBase>& decorated_reader);
+  // A set of which readers that decorated this reader.
+  std::vector<std::weak_ptr<ReaderBase>> decorated_readers_;
 };
 
-class DecoratedReader : public ReaderBase {
+class DecoratedReader : public ReaderBase,
+                        public std::enable_shared_from_this<DecoratedReader> {
  public:
   explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
       : ReaderBase(), reader_(reader) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
   }
 
-  void ReInit() override { reader_->ReInit(); }
-
- protected:
-  std::shared_ptr<ReaderBase> reader_;
-};
-
-class FileReader : public ReaderBase {
- public:
-  explicit FileReader(const std::vector<DDim>& dims);
+  void RegisterDecorateChain() {
+    reader_->InsertDecoratedReader(shared_from_this());
+  }
 
-  void ReadNext(std::vector<LoDTensor>* out) override;
+  ~DecoratedReader();
 
  protected:
-  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+  void ShutdownImpl() override { reader_->Shutdown(); }
 
- private:
-  std::vector<DDim> dims_;
+  void StartImpl() override { reader_->Start(); }
+
+  std::shared_ptr<ReaderBase> reader_;
 };
 
+// FileReader is just a conceptual class.
+class FileReader : public ReaderBase {};
+
 // The ReaderHolder is used as reader' unified wrapper,
 // making it easier to access different type reader in Variables.
 class ReaderHolder {
  public:
-  void Reset(ReaderBase* reader) { reader_.reset(reader); }
+  template <typename T>
+  void Reset(const std::shared_ptr<T>& reader) {
+    auto reader_base = std::dynamic_pointer_cast<ReaderBase>(reader);
+    PADDLE_ENFORCE_NOT_NULL(reader_base);
+    reader_ = reader_base;
+  }
 
-  std::shared_ptr<ReaderBase> Get() const { return reader_; }
+  const std::shared_ptr<ReaderBase>& Get() const { return reader_; }
 
   void ReadNext(std::vector<LoDTensor>* out) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
     reader_->ReadNext(out);
   }
-  void ReInit() {
+
+  void ResetAll() {
+    auto end_readers = reader_->GetEndPoints();
+    for (auto* reader : end_readers) {
+      reader->Shutdown();
+    }
+    for (auto* reader : end_readers) {
+      reader->Start();
+    }
+  }
+
+  void Shutdown() {
     PADDLE_ENFORCE_NOT_NULL(reader_);
-    reader_->ReInit();
+    reader_->Shutdown();
   }
 
+  void Start() {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+    reader_->Start();
+  }
+
+  operator const std::shared_ptr<ReaderBase>&() const { return this->reader_; }
+
  private:
   std::shared_ptr<ReaderBase> reader_;
 };
 
+template <typename T, typename... ARGS>
+inline std::shared_ptr<DecoratedReader> MakeDecoratedReader(ARGS&&... args) {
+  std::shared_ptr<DecoratedReader> reader(new T(std::forward<ARGS>(args)...));
+  reader->RegisterDecorateChain();
+  return reader;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0d07cb7c1367576084b9494e7758103bb45d1e5
--- /dev/null
+++ b/paddle/fluid/framework/reader_test.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/reader.h"
+#include <memory>
+#include "gtest/gtest.h"
+
+class StubDecoratedReader : public paddle::framework::DecoratedReader {
+ public:
+  explicit StubDecoratedReader(const std::shared_ptr<ReaderBase> &reader)
+      : DecoratedReader(reader) {}
+
+  void ReadNextImpl(std::vector<paddle::framework::LoDTensor> *out) override {}
+};
+
+class StubRootReader : public paddle::framework::ReaderBase {
+ public:
+  void ReadNextImpl(std::vector<paddle::framework::LoDTensor> *out) override {}
+};
+
+TEST(READER, decorate_chain) {
+  auto root = std::make_shared<StubRootReader>();
+  auto end_point1 =
+      paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
+  auto end_point2 =
+      paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
+
+  {
+    auto endpoints = root->GetEndPoints();
+    ASSERT_EQ(endpoints.size(), 2U);
+    ASSERT_NE(endpoints.count(end_point1.get()), 0);
+    ASSERT_NE(endpoints.count(end_point2.get()), 0);
+  }
+
+  {
+    auto end_point3 =
+        paddle::framework::MakeDecoratedReader<StubDecoratedReader>(root);
+    ASSERT_EQ(root->GetEndPoints().size(), 3U);
+  }
+  { ASSERT_EQ(root->GetEndPoints().size(), 2U); }
+}
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -82,7 +82,7 @@ class Tensor {
   template <typename T>
   const T* data() const;
 
-  bool IsInitialized() const;
+  inline bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor.h"
 #include <gtest/gtest.h>
 #include <string>
+#include "paddle/fluid/platform/float16.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
+
+TEST(Tensor, FP16) {
+  using platform::float16;
+  framework::Tensor src;
+  float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; ++i) {
+    src_ptr[i] = static_cast<float16>(i);
+  }
+  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
+  // EXPECT a human readable error message
+  // src.data<uint8_t>();
+  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
+  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
+}
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index f98011e896f4033ef210e0eb69f93ce7800a3cd6..ab693004cfb038fd92afd9c60e0fcb4e16b9f8a9 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 namespace framework {
@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
     os.write(out.data(), size);
   }
   {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
+    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
+
     auto* data_ptr = tensor.data<void>();
     PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
                    "Index overflow when writing tensor");
@@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     tensor->Resize(framework::make_ddim(dims));
     void* buf;
     auto ctx = platform::CPUDeviceContext();
+    size_t size =
+        tensor->numel() *
+        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
     if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
       Tensor cpu_tensor;
@@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       framework::VisitDataType(
           desc.data_type(),
           DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
@@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       framework::VisitDataType(
           desc.data_type(),
           DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), tensor->memory_size());
+      is.read(static_cast<char*>(buf), size);
     }
   }
 }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 1895aea7f98cb1ad12b2ce16545339252349ea37..ba7645aa02413f28a648f35e381da7824604a455 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,4 +1,11 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )
+# analysis and tensorrt must be added before creating static library,
+# otherwise, there would be undefined reference to them in static library.
+add_subdirectory(analysis)
+if (TENSORRT_FOUND)
+  add_subdirectory(tensorrt)
+endif()
+
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
@@ -7,15 +14,24 @@ cc_library(paddle_fluid_api
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 
-if(WITH_CONTRIB)
-  set(fluid_modules "${fluid_modules}" paddle_inference_api)
+# paddle_fluid_origin exclude inference api interface
+cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
+
+if(NOT APPLE)
+  add_subdirectory(api)
 endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
+if(NOT APPLE)
+  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
+  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
+  set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
+
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
-    SRCS io.cc
+    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     DEPS ${fluid_modules} paddle_fluid_api)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
@@ -23,15 +39,21 @@ if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
   set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  # check symbol hidden
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_fluid_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 if(WITH_TESTING)
   # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
 endif()
-
-add_subdirectory(analysis)
-
-if (TENSORRT_FOUND)
-  add_subdirectory(tensorrt)
-endif()
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index cdd67fdc929851979fe0a38afe1af74ec7321b8a..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
   tensorrt_subgraph_node_mark_pass.cc
   analyzer.cc
   helper.cc
+        model_store_pass.cc
   DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
 
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
@@ -19,10 +21,14 @@ function (inference_analysis_test TARGET)
         set(multiValueArgs SRCS)
         cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
+        set(mem_opt "")
+        if(WITH_GPU)
+            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+        endif()
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
                 DEPS analysis
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
@@ -36,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
+inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..c4ab26a2288bb9d8f3cd54a797d2062e0606b219 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,35 +17,52 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
 
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
             "Enable subgraph to TensorRT engine for acceleration");
 
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
+DEFINE_string(inference_analysis_output_storage_path, "",
+              "optimized model output path");
+
+namespace inference {
+namespace analysis {
+
 class DfgPassManagerImpl final : public DfgPassManager {
  public:
   DfgPassManagerImpl() {
     // TODO(Superjomn) set the key with pass reprs.
     AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
     if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
-      auto trt_teller = [](const Node* node) {
+      auto trt_teller = [&](const Node* node) {
+        std::unordered_set<std::string> teller_set(
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
         if (!node->IsFunction()) return false;
-        return static_cast<const Function*>(node)->func_type() == "mul";
+
+        const auto* func = static_cast<const Function*>(node);
+        if (teller_set.count(func->func_type()))
+          return true;
+        else {
+          return false;
+        }
       };
+
       AddPass("tensorrt-subgraph-marker",
               new TensorRTSubgraphNodeMarkPass(trt_teller));
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
   }
 
   std::string repr() const override { return "dfg-pass-manager"; }
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index e9e14fb1947da059c8d126d3da182ce446f6421e..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -16,28 +16,23 @@ limitations under the License. */
 
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize
- * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
- * control whether
- * an process is applied on the program.
+ * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
+ * multiple flags to
+ * control whether an process is applied on the program.
  *
  * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first
- * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
- * a data flow
- * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
- * graph to a
- * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
- * which take a
- * node or data flow graph as input.
+ * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
+ * Fluid ProgramDesc to
+ * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
+ * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
+ * pipeline can be any Passes
+ * which take a node or data flow graph as input.
  *
  * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to
- * pre-process the inference model and can be controlled by passing difference
- * command flags;
+ * can be used to pre-process the inference model and can be controlled by
+ * passing difference command flags;
  * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the
- * inference service.
+ * phase in the inference service.
  */
 
 #include <gflags/gflags.h>
@@ -45,13 +40,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
 
 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+DECLARE_string(inference_analysis_output_storage_path);
+
+namespace inference {
+namespace analysis {
 
 class Analyzer : public OrderedRegistry<PassManager> {
  public:
diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/inference/analysis/analyzer_main.cc
similarity index 50%
rename from paddle/fluid/framework/details/ssa_graph.cc
rename to paddle/fluid/inference/analysis/analyzer_main.cc
index 1b8c889449059c563ea39f86250075ac2537cdbe..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 100644
--- a/paddle/fluid/framework/details/ssa_graph.cc
+++ b/paddle/fluid/inference/analysis/analyzer_main.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,4 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/ssa_graph.h"
+/*
+ * This file implements analysizer -- an executation help to analyze and
+ * optimize trained model.
+ */
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  using paddle::inference::analysis::Analyzer;
+  using paddle::inference::analysis::Argument;
+
+  Argument argument;
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+
+  return 0;
+}
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index d7c1a72932a39f878add2bb884e280b91d3c38c0..24bfb3993cf569561980006b6627b56327dd0f67 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,13 +13,25 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, main) {
+TEST(Analyzer, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+TEST(Analyzer, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 6d316f20bff7a68754b0afec6463bd5d7579227f..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 
@@ -36,6 +37,16 @@ namespace analysis {
  * All the fields should be registered here for clearness.
  */
 struct Argument {
+  Argument() = default;
+  explicit Argument(const std::string& fluid_model_dir)
+      : fluid_model_dir(new std::string(fluid_model_dir)) {}
+  // The directory of the trained model.
+  std::unique_ptr<std::string> fluid_model_dir;
+  // The path of `__model__` and `param`, this is used when the file name of
+  // model and param is changed.
+  std::unique_ptr<std::string> fluid_model_program_path;
+  std::unique_ptr<std::string> fluid_model_param_path;
+
   // The graph that process by the Passes or PassManagers.
   std::unique_ptr<DataFlowGraph> main_dfg;
 
@@ -44,6 +55,9 @@ struct Argument {
 
   // The processed program desc.
   std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+
+  // The output storage path of ModelStorePass.
+  std::unique_ptr<std::string> model_output_store_path;
 };
 
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index d09bf3ed161703b0cf273522921e157c7360a0bc..7f64bc75ae8ad40a268739cdc36051e76af9f49a 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
   return dot.Build();
 }
 
+std::string DataFlowGraph::HumanReadableInfo(bool show_values,
+                                             bool show_functions) const {
+  std::stringstream values, functions;
+  for (auto &n : nodes.nodes()) {
+    if (show_values && n->IsValue()) {
+      values << n->repr() << "\n";
+    }
+    if (show_functions && n->IsFunction()) {
+      functions << n->repr() << "\n";
+    }
+  }
+  return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
+}
+
 //
 // NodesBFSIterator
 //
@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
   if ((!queue_.empty()) && (!other.queue_.empty())) {
     return queue_.front() == other.queue_.front() &&
            visited_.size() == other.visited_.size();  // here need to check the
-                                                      // equality of queue and
+    // equality of queue and
     // visited. Just a light but week implementation.
   }
   return false;
@@ -208,6 +222,149 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
   return stack_.top();
 }
 
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
+
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  std::vector<Node *> inlink_visited;
+  while (!to_visit.empty()) {
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
+      inlink_visited.clear();
+
+      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
+                   std::back_inserter(inlink_visited),
+                   [&](Node *x) { return visited.count(x); });
+
+      if (inlink_visited.size() == p->inlinks.size()) {
+        sorted_.push_back(p);
+        for (auto *_ : p->outlinks) {
+          if (!visited.count(_)) {
+            to_visit.insert(_);
+          }
+        }
+
+        to_visit.erase(p);
+        visited.insert(p);
+      }
+    }
+  }
+}
+
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return *sorted_[cursor_];
+}
+
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
+    &GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
+  if (++cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
+GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  cursor_ = other.cursor_;
+  sorted_ = other.sorted_;
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return sorted_[cursor_];
+}
+
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
+  std::vector<Node *> op_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.type() == Node::Type::kValue || node.deleted()) {
+      continue;
+    }
+    op_nodes.push_back(&node);
+  }
+  size_t op_num = op_nodes.size();
+  for (size_t i = 0; i < op_num; i++) {
+    if (op_nodes[i]->type() == Node::Type::kFunction) continue;
+    std::unordered_set<std::string> follow_up_input_names;
+    for (size_t j = i + 1; j < op_num; j++) {
+      for (auto *in : op_nodes[j]->inlinks) {
+        follow_up_input_names.insert(in->name());
+      }
+    }
+    std::vector<Node *> filtered_subgraph_outlinks;
+    for (auto *out : op_nodes[i]->outlinks) {
+      if (follow_up_input_names.count(out->name())) {
+        filtered_subgraph_outlinks.push_back(out);
+      }
+    }
+    PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+    op_nodes[i]->outlinks = filtered_subgraph_outlinks;
+  }
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index a4fefc83e0c551d52bec87299bcbc966e7a2dbf7..bb3ec6bbc1d9555386aba8837b019d2511653258 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -36,6 +36,8 @@ namespace analysis {
 
 /*
  * DataFlowGraph - A container of Value and Function Nodes.
+ *
+ * This is the base graph for any other type of graphs, such as SSA or CFG.
  */
 struct DataFlowGraph {
   NodeMap nodes;
@@ -48,6 +50,9 @@ struct DataFlowGraph {
   // Output a DOT graph file for debug.
   std::string DotString() const;
 
+  std::string HumanReadableInfo(bool show_values = true,
+                                bool show_functions = true) const;
+
  private:
   // Remove duplicate edges and so on.
   void Clean();
@@ -107,6 +112,32 @@ struct GraphTraits<DataFlowGraph> {
     std::unordered_set<Node *> visited_;
   };
 
+  // Topological sorting iterator on nodes.
+  struct NodesTSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesTSIterator() = default;
+    explicit NodesTSIterator(const std::vector<Node *> &source);
+    NodesTSIterator(NodesTSIterator &&other)
+        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+      other.cursor_ = 0;
+    }
+    NodesTSIterator(const NodesTSIterator &other);
+
+    Node &operator*();
+    NodesTSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesTSIterator &operator=(const NodesTSIterator &other);
+    bool operator==(const NodesTSIterator &other);
+    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::vector<Node *> sorted_;
+    size_t cursor_{0};
+  };
+
   explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
 
   // default use BFS to visit the nodes.
@@ -119,17 +150,24 @@ struct GraphTraits<DataFlowGraph> {
   iterator_range<NodesDFSIterator> nodes_in_DFS() {
     return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
   }
+  iterator_range<NodesTSIterator> nodes_in_TS() {
+    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
+  }
 
  private:
   NodesBFSIterator nodes_bfs_begin() {
     return NodesBFSIterator(graph_->inputs);
   }
   NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+
   NodesDFSIterator nodes_dfs_begin() {
     return NodesDFSIterator(graph_->inputs);
   }
   NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
 
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
+
  private:
   DataFlowGraph *graph_;
 };
@@ -137,37 +175,10 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 9d7cceeb65888b8ba3fdf39e88fc2877abd82d11..a881262665f156812da9e1576aa29b05fc398499 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -20,15 +20,15 @@ namespace inference {
 namespace analysis {
 
 TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
-  for (auto* in : dfg.inputs) {
+  for (auto *in : dfg.inputs) {
     LOG(INFO) << "inputs: " << in->name() << " "
               << static_cast<int>(in->type());
   }
-  for (auto* out : dfg.outputs) {
+  for (auto *out : dfg.outputs) {
     LOG(INFO) << "outputs: " << out->name() << " "
               << static_cast<int>(out->type());
   }
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
 }
 
 TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
   GraphTraits<DataFlowGraph> trait(&dfg);
@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
   ASSERT_EQ(count, dfg.nodes.size());
 }
 
+// Topological sorting.
+/*
+ * Graph topology
+ * inputs: 0, 1, 2
+ * 0 -> 4
+ * 0 -> 5
+ * 1 -> 6
+ * 2 -> 7
+ * 4 -> 5
+ * 4 -> 7
+ * 4 -> 3
+ * 7 -> 3
+ */
+TEST(DataFlowGraph, TS) {
+  DataFlowGraph graph;
+
+  for (int i = 0; i < 8; i++) {
+    auto *node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }
+
+  auto add_link = [&](int i, int j) {
+    Node *source = graph.nodes.GetMutable(i);
+    Node *target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };
+
+  graph.inputs.push_back(graph.nodes.GetMutable(0));
+  graph.inputs.push_back(graph.nodes.GetMutable(1));
+  graph.inputs.push_back(graph.nodes.GetMutable(2));
+
+  add_link(0, 4);
+  add_link(0, 5);
+  add_link(1, 6);
+  add_link(2, 7);
+  add_link(4, 5);
+  add_link(4, 7);
+  add_link(4, 3);
+  add_link(7, 3);
+
+  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  std::vector<int> sorted_ids;
+  for (auto it = its.begin(); it != its.end(); ++it) {
+    LOG(INFO) << it->name();
+    sorted_ids.push_back(it->id());
+  }
+
+  // Assert a occurs prior to b in the sorted_ids.
+  auto assert_positive_sequence_pair = [&](int a, int b) {
+    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
+    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
+    ASSERT_LT(a_offset, b_offset);
+  };
+
+  assert_positive_sequence_pair(2, 7);
+  assert_positive_sequence_pair(7, 3);
+  assert_positive_sequence_pair(4, 3);
+  assert_positive_sequence_pair(0, 4);
+  assert_positive_sequence_pair(0, 5);
+  assert_positive_sequence_pair(1, 6);
+  assert_positive_sequence_pair(4, 5);
+  assert_positive_sequence_pair(4, 7);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 29ca008123addf07959b965a4b54bf55b18c401d..18c32fa09199003f17183207828cdfe4e627ae1a 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -22,14 +22,18 @@
 
 namespace paddle {
 namespace inference {
+
+DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
+
 namespace analysis {
 
 using framework::proto::ProgramDesc;
 
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes);
+    const std::vector<std::unique_ptr<Node>> &nodes);
 
-bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
   PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,76 +51,157 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
 
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
 
-void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
-    if (it->deleted()) continue;
+void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+  FilterRedundantOutputOfSubGraph(graph);
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.deleted()) continue;
 
-    switch (it->type()) {
+    switch (node.type()) {
       case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
-        AddFluidOp(&(*it));
+        LOG(INFO) << "add function " << node.repr();
+        AddFluidOp(&node);
       } break;
       case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
-        AddEngineOp(&(*it));
+        LOG(INFO) << "add engine op " << node.repr() << " , "
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
+        AddEngineOp(&node);
       } break;
       default:
         continue;
     }
   }
+
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }
 
-void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
-  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
+  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
   // currently only the main block is analyzed.
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
   *op = *ori_op;  // copy the attributes, by default, these will not be changed
-                  // by analysis phrase.
+  // by analysis phrase.
   // The inputs and outputs of the existing ops are not changed by tensorrt
   // subgraph pass.
   // NOTE It might be changed by other passes in the long run.
 }
 
-void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
-                       const framework::proto::BlockDesc& block) {
+void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+                       framework::proto::BlockDesc *block) {
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
-  auto* func = static_cast<FunctionBlock*>(node);
+  auto *func = static_cast<FunctionBlock *>(node);
 
   // collect inputs
-  std::vector<std::string> io;
-  for (auto* x : func->inlinks) {
-    io.push_back(x->name());
+  std::unordered_set<std::string> input_names;
+  for (auto *x : func->inlinks) {
+    input_names.insert(x->name());
   }
-  desc.SetInput("Xs", io);
+  desc.SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
-  // collect outputs
-  io.clear();
-  for (auto* x : func->outlinks) {
-    io.push_back(x->name());
+  std::unordered_set<std::string> output_names;
+  for (auto *x : func->outlinks) {
+    output_names.insert(x->name());
   }
-  desc.SetOutput("Ys", io);
 
+  std::vector<std::string> output_temp(output_names.begin(),
+                                       output_names.end());
+  desc.SetOutput("Ys", output_temp);
   desc.SetType("tensorrt_engine");
+
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure is used to rename all the intermediate
+  // variables and the output variables of the subgraph.
+  // Why we do this?
+  // During the transition from fluid OP to tensorrt OP, we map
+  // the input and output Tensor(fluid data structure) of fluid OP
+  // to the correspondin ITensor (trt data structure) through the
+  // Tensor name. When we set up ITensor for an variable, we must
+  // ensure that it has not been set before.
+  // If there is variable in the fluid graph, which is not only the
+  // input of a OP, but also the output of a Op, there will be problems.
+  // So we have to rename the variable in the subgraph to make sure
+  // it is either an OP's input or an OP's output.
+
+  auto subgraph_nodes = func->subgraph;
+  for (int index = 0; index < block->ops_size(); index++) {
+    framework::proto::OpDesc *op = block->mutable_ops(index);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    for (auto *in_var : correspond_node->inlinks) {
+      var2id[in_var->name()] = in_var->id();
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {
+        std::string arg_value = in_var->arguments(k);
+        if (input_names.count(arg_value)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value +
+                                   std::to_string(var2id[arg_value]));
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outlinks) {
+      var2id[out_var->name()] = out_var->id();
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        if (output_names.count(arg_value)) {
+          output_name_map[arg_value] =
+              arg_value + std::to_string(var2id[arg_value]);
+        }
+        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+  // When tensorrt engine runs at the end of the operation,
+  // output_mapping help us copy the data from the renamed ITensor
+  // to Tensor.
+  std::vector<std::string> output_mapping;
+  for (auto name : output_names) {
+    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    output_mapping.push_back(output_name_map[name]);
+  }
+
+  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
   // Set attrs
-  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
-          "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
 
 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes) {
+    const std::vector<std::unique_ptr<Node>> &nodes) {
   std::vector<std::string> parameters;
-  for (const auto& node : nodes) {
+  for (const auto &node : nodes) {
     if (!node->IsValue()) continue;
     PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
     framework::proto::VarDesc var;
@@ -128,21 +213,32 @@ std::vector<std::string> ExtractParameters(
   return parameters;
 }
 
-void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   // TODO(Superjomn) Here need to expose some arguments for default setting.
   PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto* block_node = static_cast<FunctionBlock*>(node);
+  auto *block_node = static_cast<FunctionBlock *>(node);
   framework::proto::BlockDesc proto;
   framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
   // copy ops.
-  for (auto* node : block_node->subgraph) {
-    auto* op = block_desc.AppendOp();
+
+  for (auto *node : block_node->subgraph) {
+    auto *op = block_desc.AppendOp();
     PADDLE_ENFORCE(!node->pb_msg().empty());
     op->Proto()->ParseFromString(node->pb_msg());
   }
-  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
+  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
   PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
   op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +247,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
  public:
   using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config& config)
+  explicit DFG_DebuggerPass(const Config &config)
       : DFG_GraphvizDrawPass(config) {}
 
   std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +256,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
 
-Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_inference_analysis_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index edc84b02ed20991e3e7c6c437d2b1fac169bae03..59c47365aa6c8ad5886c4515850d264f69cc4670 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -26,6 +26,10 @@
 
 namespace paddle {
 namespace inference {
+
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
+
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -26,21 +26,21 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Test) {
-  DataFlowGraph graph;
+TEST(DataFlowGraph, Test) {
+  Argument argument(FLAGS_inference_model_dir);
 
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
   ASSERT_TRUE(pass0.Initialize(&argument));
   ASSERT_TRUE(pass1.Initialize(&argument));
 
-  pass0.Run(&graph);
-  pass1.Run(&graph);
+  pass0.Run(argument.main_dfg.get());
+  pass1.Run(argument.main_dfg.get());
 
   pass0.Finalize();
   pass1.Finalize();
 
-  LOG(INFO) << graph.nodes.size();
+  LOG(INFO) << argument.main_dfg->nodes.size();
 }
 
 };  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
index a6f85484756417e103cbb60bcb664e8b800b9f28..c05b0e5d4690d0a447edf63a149903704bc2c9be 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
   for (size_t i = 0; i < graph->nodes.size(); i++) {
     const Node &node = graph->nodes.Get(i);
     if (!config_.display_deleted_node && node.deleted()) continue;
-    for (auto &in : node.inlinks) {
-      if (!config_.display_deleted_node && in->deleted()) continue;
-      dot.AddEdge(in->repr(), node.repr(), {});
+    for (auto &out : node.outlinks) {
+      if (!config_.display_deleted_node && out->deleted()) continue;
+      dot.AddEdge(node.repr(), out->repr(), {});
     }
   }
   return dot.Build();
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..928be7917047382d9b86294f6039b26b0ebf6f49 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -23,12 +23,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
+  Argument argument(FLAGS_inference_model_dir);
+  FluidToDataFlowGraphPass pass0;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  pass0.Run(argument.main_dfg.get());
+
+  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+
   DFG_GraphvizDrawPass::Config config("./", "test");
   DFG_GraphvizDrawPass pass(config);
   pass.Initialize(&argument);
-  pass.Run(&dfg);
+  pass.Run(argument.main_dfg.get());
 
   // test content
   std::ifstream file("./0-graph_test.dot");
@@ -40,7 +46,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
     no++;
   }
   // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index e918622d74cfb11d83090555be2a768cc14e7742..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <glog/logging.h>
 #include <string>
 #include <vector>
 
@@ -25,10 +26,21 @@ namespace analysis {
 
 bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
-  PADDLE_ENFORCE(argument);
+  if (argument->origin_program_desc) {
+    LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                    "duplicate called";
+  }
+  if (!argument->fluid_model_program_path) {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+    argument->fluid_model_program_path.reset(
+        new std::string(*argument->fluid_model_dir + "/__model__"));
+  }
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+  argument->origin_program_desc.reset(
+      new framework::proto::ProgramDesc(program));
+
   if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
     argument->main_dfg.reset(new DataFlowGraph);
   }
   desc_ = argument->origin_program_desc.get();
@@ -41,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
   PADDLE_ENFORCE(desc_);
   // insert vars
+  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
+  // will keep updating to its latest alias during the graph-building.
   std::unordered_map<std::string, size_t> var2id;
   auto &main_block = desc_->blocks(framework::kRootBlockIndex);
   for (int i = 0; i < main_block.vars_size(); i++) {
@@ -51,6 +65,16 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     v->SetPbMsg(var.SerializeAsString());
     var2id[var.name()] = v->id();
   }
+
+  // The variables in a SSA can only write once, so if a variable is written
+  // multiple times(quite common in our ProgramDesc design), multiple alias
+  // Nodes of this variable will be created, and each will just write once.
+
+  // An set that keep all the names of the variables(the original, not alias)
+  // that have been written(as outputs). Once an Op's output variable hit the
+  // set, it should create a new alias and update the global alias for this
+  // variable. And that make a Data Flow Graph a SSA.
+  std::unordered_set<Node *> unique_written_vars;
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,7 +86,6 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
 
     // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
@@ -75,8 +98,21 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (unique_written_vars.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] =
+              out_alias->id();  // update variable's alias Node
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
+        unique_written_vars.insert(out);
       }
     }
   }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index da8463b63bd0bb1633bfcb9d7d41a884ddd632c7..fb948bf2242abcbc1e841fd3b8457e63358782c5 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -30,7 +30,7 @@ namespace inference {
 namespace analysis {
 
 /*
- * Transform a FluidDesc to a data flow graph.
+ * Transform a FluidDesc to a SSA.
  */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
  public:
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index cfbbc284e491bd62a6108d6d14e7896a57d1b63e..d218dcd05015aa4636c16569de4addf4936c8cd5 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -21,15 +21,16 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Init) {
+TEST(FluidToDataFlowGraphPass, Test) {
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   pass.Initialize(&argument);
-  DataFlowGraph graph;
-  pass.Run(&graph);
+  pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
   pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
   }
 }
 
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string &model_path) {
+  std::ifstream fin(model_path, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  fin.seekg(0, std::ios::end);
+  std::string buffer(fin.tellg(), ' ');
+  fin.seekg(0, std::ios::beg);
+  fin.read(&buffer[0], buffer.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(buffer);
+  return program_desc;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c429176424bd5c1d8fa5e015c19d698f966880e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void ModelStorePass::Run(DataFlowGraph *x) {
+  if (!argument_->fluid_model_param_path) {
+    PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
+    argument_->fluid_model_param_path.reset(
+        new std::string(*argument_->fluid_model_dir + "param"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
+  // Directly copy param file to destination.
+  std::stringstream ss;
+  // NOTE these commands only works on linux.
+  ss << "mkdir -p " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+  ss.str("");
+
+  ss << "cp " << *argument_->fluid_model_dir << "/*"
+     << " " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+
+  // Store program
+  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
+                          "program desc is not transformed, should call "
+                          "DataFlowGraphToFluidPass first.");
+  const std::string program_output_path =
+      *argument_->model_output_store_path + "/__model__";
+  std::ofstream file(program_output_path, std::ios::binary);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
+                 program_output_path);
+  const std::string serialized_message =
+      argument_->transformed_program_desc->SerializeAsString();
+  file.write(serialized_message.c_str(), serialized_message.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..fac7083925776b6209d49255c9e67b930cb1250b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines ModelStorePass, which store the runtime DFG to a Paddle
+ * model in the disk, and that model can be reloaded for prediction.
+ */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class ModelStorePass : public DataFlowGraphPass {
+ public:
+  bool Initialize(Argument* argument) override {
+    if (!argument) {
+      LOG(ERROR) << "invalid argument";
+      return false;
+    }
+    argument_ = argument;
+    return true;
+  }
+
+  void Run(DataFlowGraph* x) override;
+
+  std::string repr() const override { return "DFG-store-pass"; }
+  std::string description() const override {
+    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
+    model in the disk, and that model can be reloaded for prediction again.)DD";
+  }
+
+ private:
+  Argument* argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "Model path");
+
+TEST(DFG_StorePass, test) {
+  Analyzer analyzer;
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(
+      new std::string("./_dfg_store_pass_tmp"));
+  // disable storage in alalyzer
+  FLAGS_inference_analysis_output_storage_path = "";
+  analyzer.Run(&argument);
+
+  ModelStorePass pass;
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -50,6 +50,7 @@ class Pass {
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
 
+  virtual void Run() { LOG(FATAL) << "not valid"; }
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
   std::string description() const override { return "some doc"; }
 };
 
-TEST_F(DFG_Tester, DFG_pass_manager) {
+TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
 
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
 
+  Argument argument(FLAGS_inference_model_dir);
+
   ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }
 
-TEST_F(DFG_Tester, Node_pass_manager) {
+TEST(PassManager, Node_pass_manager) {
+  Argument argument(FLAGS_inference_model_dir);
   // Pre-process: initialize the DFG with the ProgramDesc first.
   FluidToDataFlowGraphPass pass0;
   pass0.Initialize(&argument);
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 389f9e1a9148a4daf0e5b751cce5cb6325252a4e..80809d4c43ca08298bad25cf614dcb4117d3f99a 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
 
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 8134494f8bccb132f2ed7d1ba1fb615a298596ed..39cc433b40fad17f4f12359d4e907a250a88bd63 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
   return false;
 };
 
-TEST_F(DFG_Tester, Split) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Split) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
 
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }
 
-TEST_F(DFG_Tester, Fuse) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Fuse) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
 
   size_t count0 = dfg.nodes.size();
@@ -82,7 +82,7 @@ TEST_F(DFG_Tester, Fuse) {
 
   // At least one nodes should be deleted.
   ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
-  ASSERT_EQ(6UL, count1);
+  ASSERT_EQ(6, count1);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+TEST(TensorRTSubgraphNodeMarkPass, test) {
   // init
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(pass.Initialize(&argument));
-  argument.main_dfg.reset(new DataFlowGraph);
   pass.Run(argument.main_dfg.get());
 
   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
   for (auto& node : argument.main_dfg->nodes.nodes()) {
     counter += node->attr(ATTR_supported_by_tensorrt).Bool();
   }
-
+  ASSERT_EQ(counter, 2);
   LOG(INFO) << counter << " nodes marked";
 }
 
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
 
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
   SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -25,7 +25,7 @@ namespace analysis {
 
 DEFINE_string(dot_dir, "./", "");
 
-TEST_F(DFG_Tester, tensorrt_single_pass) {
+TEST(TensorRTSubGraphPass, main) {
   std::unordered_set<std::string> teller_set(
       {"elementwise_add", "mul", "sigmoid"});
   SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
     return false;
   };
 
-  LOG(INFO) << "init";
+  Argument argument(FLAGS_inference_model_dir);
+
   DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
   DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
 
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
   FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));
 
-  LOG(INFO) << "Initialize";
   dfg_pass.Initialize(&argument);
   dfg_pass1.Initialize(&argument);
   pass0.Initialize(&argument);
   trt_pass.Initialize(&argument);
 
-  LOG(INFO) << "Run";
   argument.main_dfg.reset(new DataFlowGraph);
   pass0.Run(argument.main_dfg.get());
   dfg_pass.Run(argument.main_dfg.get());
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
 namespace inference {
@@ -32,27 +32,12 @@ namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string& model_dir = FLAGS_inference_model_dir) {
-  std::string msg;
-  std::string net_file = FLAGS_inference_model_dir + "/__model__";
-  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
-  fin.seekg(0, std::ios::end);
-  msg.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(msg.at(0)), msg.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(msg);
-  return program_desc;
-}
-
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc& desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
   Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   pass.Initialize(&argument);
   pass.Run(&graph);
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
 class DFG_Tester : public ::testing::Test {
  protected:
   void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
     argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   }
 
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
similarity index 67%
rename from paddle/contrib/inference/CMakeLists.txt
rename to paddle/fluid/inference/api/CMakeLists.txt
index a8bbb4eb8081420ae0bbaf761bd27303c0d043cb..08d0f493ab30d92a121d089d9003bc575429b4dd 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -19,6 +19,7 @@ endif(APPLE)
 
 
 set(inference_deps paddle_inference_api paddle_fluid_api)
+
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
 endif()
@@ -26,13 +27,13 @@ endif()
 function(inference_api_test TARGET_NAME)
     if (WITH_TESTING)
         set(options "")
-        set(oneValueArgs "")
+        set(oneValueArgs SRC)
         set(multiValueArgs ARGS)
         cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
         set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
         cc_test(${TARGET_NAME}
-                SRCS ${TARGET_NAME}.cc
+                SRCS ${inference_test_SRC}
                 DEPS "${inference_deps}"
                 ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
         if(inference_test_ARGS)
@@ -42,47 +43,41 @@ function(inference_api_test TARGET_NAME)
     endif(WITH_TESTING)
 endfunction(inference_api_test)
 
-cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
 
-cc_library(paddle_inference_api_shared SHARED
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 cc_test(test_paddle_inference_api
-        SRCS test_paddle_inference_api.cc
+        SRCS api_tester.cc
         DEPS paddle_inference_api)
 
-inference_api_test(test_paddle_inference_api_impl
+inference_api_test(test_api_impl SRC api_impl_tester.cc
                     ARGS test_word2vec test_image_classification)
 
 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
-        SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
+        SRCS api_tensorrt_subgraph_engine.cc
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
 
-inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
+inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
 
 if (WITH_ANAKIN) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
     # compile the libinference_anakin_api.a and compile with anakin.so.
-    nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    # compile the libinference_anakin_api.a and anakin.so.
+    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
+    nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
     target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     target_link_libraries(inference_anakin_api anakin anakin_saber_common)
     target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
     if (WITH_TESTING)
-        cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                                  DEPS inference_anakin_api)
-        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+        # this test is unstable, disable it first.
+        #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
+                                  #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  #DEPS inference_anakin_api_shared)
+        #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
      endif(WITH_TESTING)
 endif()
-
-if(WITH_TESTING)
-    add_subdirectory(demo)
-endif()
diff --git a/paddle/contrib/inference/README.md b/paddle/fluid/inference/api/README.md
similarity index 100%
rename from paddle/contrib/inference/README.md
rename to paddle/fluid/inference/api/README.md
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/fluid/inference/api/api.cc
similarity index 75%
rename from paddle/contrib/inference/paddle_inference_api.cc
rename to paddle/fluid/inference/api/api.cc
index ea46b3006f8d0964cc8229d3683ee7b602d6ef0d..63c3f0d7b3f5c2b9246e2b041796caf5eb562826 100644
--- a/paddle/contrib/inference/paddle_inference_api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <glog/logging.h>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 namespace paddle {
 
@@ -23,7 +24,6 @@ int PaddleDtypeSize(PaddleDType dtype) {
     case PaddleDType::INT64:
       return sizeof(int64_t);
     default:
-      //
       assert(false);
       return -1;
   }
@@ -41,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
 PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
 
 PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  if (!other.memory_owned_) {
+    data_ = other.data_;
+    length_ = other.length_;
+    memory_owned_ = other.memory_owned_;
+  } else {
+    Resize(other.length());
+    memcpy(data_, other.data(), other.length());
+    length_ = other.length();
+    memory_owned_ = true;
+  }
+  return *this;
+}
+
+PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
   // only the buffer with external memory can be copied
-  assert(!other.memory_owned_);
   data_ = other.data_;
   length_ = other.length_;
   memory_owned_ = other.memory_owned_;
+  other.data_ = nullptr;
+  other.length_ = 0;
+  other.memory_owned_ = false;
   return *this;
 }
 
 void PaddleBuf::Resize(size_t length) {
   // Only the owned memory can be reset, the external memory can't be changed.
   if (length_ == length) return;
-  assert(memory_owned_);
-  Free();
+  if (memory_owned_) {
+    Free();
+  }
   data_ = new char[length];
   length_ = length;
   memory_owned_ = true;
@@ -69,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
     assert(length_ > 0);
-    delete static_cast<char*>(data_);
+    delete[] static_cast<char*>(data_);
     data_ = nullptr;
     length_ = 0;
   }
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b374ceefbc180a5c22abe591f12e1c3d89bc64a
--- /dev/null
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/api_anakin_engine.h"
+#include <cuda.h>
+#include <vector>
+
+namespace paddle {
+
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
+  if (!(graph_.load(config.model_file))) {
+    LOG(FATAL) << "fail to load graph from " << config.model_file;
+    return false;
+  }
+  auto inputs = graph_.get_ins();
+  for (auto &input_str : inputs) {
+    graph_.ResetBatchSize(input_str, config.max_batch_size);
+  }
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  if (executor_p_ == nullptr) {
+    executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                  anakin::Precision::FP32>(graph_, true);
+  }
+  return true;
+}
+
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data, int batch_size) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    auto d_tensor_in_p = executor_p_->get_in(input.name);
+    auto net_shape = d_tensor_in_p->valid_shape();
+    if (net_shape.size() != input.shape.size()) {
+      LOG(ERROR) << " input  " << input.name
+                 << "'s shape size should be equal to that of net";
+      return false;
+    }
+    int sum = 1;
+    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
+    if (sum > net_shape.count()) {
+      graph_.Reshape(input.name, input.shape);
+      delete executor_p_;
+      executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                    anakin::Precision::FP32>(graph_, true);
+      d_tensor_in_p = executor_p_->get_in(input.name);
+    }
+
+    anakin::saber::Shape tmp_shape;
+    for (auto s : input.shape) {
+      tmp_shape.push_back(s);
+    }
+    d_tensor_in_p->reshape(tmp_shape);
+
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
+    cudaStreamSynchronize(NULL);
+  }
+  cudaDeviceSynchronize();
+  executor_p_->prediction();
+  cudaDeviceSynchronize();
+
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = executor_p_->get_out(output.name);
+    output.shape = tensor->valid_shape();
+    if (output.data.length() < tensor->valid_size() * sizeof(float)) {
+      output.data.Resize(tensor->valid_size() * sizeof(float));
+    }
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+    cudaStreamSynchronize(NULL);
+  }
+  return true;
+}
+
+template <typename Target>
+anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor<Target>::get_executer() {
+  return *executor_p_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
+template <typename Target>
+std::unique_ptr<PaddlePredictor>
+PaddleInferenceAnakinPredictor<Target>::Clone() {
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(
+      new PaddleInferenceAnakinPredictor<Target>());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
+}
+
+template class PaddleInferenceAnakinPredictor<anakin::NV>;
+template class PaddleInferenceAnakinPredictor<anakin::X86>;
+
+// A factory to help create difference predictor.
+template <>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
+    AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
+  if (config.target_type == AnakinConfig::NVGPU) {
+    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
+    return x;
+  } else if (config.target_type == AnakinConfig::X86) {
+    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::X86>(config));
+    return x;
+  } else {
+    VLOG(3) << "Anakin Predictor create on unknown platform.";
+    return nullptr;
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
similarity index 65%
rename from paddle/contrib/inference/paddle_inference_api_anakin_engine.h
rename to paddle/fluid/inference/api/api_anakin_engine.h
index 212ba41cdf8ff2feccb6b6498f9679d76a2efe7c..836badd9799228c6c294dcad5df73d039d36a1ff 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -19,41 +19,46 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <vector>
 
-// from anakin
 #include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "saber/core/shape.h"
 #include "saber/saber_types.h"
 
 namespace paddle {
 
+template <typename Target>
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
   PaddleInferenceAnakinPredictor() {}
 
-  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+  explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
 
   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
   bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();
 
-  ~PaddleInferenceAnakinPredictor() override{};
+  ~PaddleInferenceAnakinPredictor() override {
+    delete executor_p_;
+    executor_p_ = nullptr;
+  };
 
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::graph::Graph<anakin::NV,
-                       anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
+  anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
       graph_;
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-      executor_;
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
+      executor_p_{nullptr};
   AnakinConfig config_;
 };
 
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
similarity index 61%
rename from paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
rename to paddle/fluid/inference/api/api_anakin_engine_tester.cc
index f92e9d4190412f5847e353ef1dc0324cad668c9a..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
 
 namespace paddle {
 
 AnakinConfig GetConfig() {
   AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::NVGPU;
   config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
@@ -36,28 +38,27 @@ TEST(inference, anakin) {
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
 
   float data[1 * 3 * 224 * 224] = {1.0f};
-
-  PaddleTensor tensor{.name = "input_0",
-                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = PaddleBuf(data, sizeof(data)),
-                      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;
 
   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.emplace_back(std::move(tensor));
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
 
-  PaddleTensor tensor_out{.name = "prob_out",
-                          .shape = std::vector<int>({1000, 1}),
-                          .data = PaddleBuf(),
-                          .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor_out;
+  tensor_out.name = "prob_out";
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
 
-  std::vector<PaddleTensor> outputs;
-  outputs.emplace_back(std::move(tensor_out));
+  std::vector<PaddleTensor> outputs(1, tensor_out);
 
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
 
   float* data_o = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < 1000; ++j) {
+  for (size_t j = 0; j < outputs[0].data.length(); ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
 }
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
similarity index 87%
rename from paddle/contrib/inference/paddle_inference_api_impl.cc
rename to paddle/fluid/inference/api/api_impl.cc
index b1e5b875981e0142f6970cf6864b7b598743654b..08d7af6d3af7054061b15b904c69b2862c629562 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/api/api_impl.h"
 
 namespace paddle {
 namespace {
@@ -66,6 +66,7 @@ bool NativePaddlePredictor::Init(
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
+    PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
   } else {
     paddle::framework::InitDevices(false);
     scope_.reset(new paddle::framework::Scope());
@@ -77,8 +78,8 @@ bool NativePaddlePredictor::Init(
   if (!config_.model_dir.empty()) {
     // Parameters are saved in separate files sited in
     // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(
-        executor_.get(), scope_.get(), config_.model_dir);
+    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
+                                                 config_.model_dir);
   } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
     // All parameters are saved in a single file.
     // The file names should be consistent with that used
@@ -91,8 +92,8 @@ bool NativePaddlePredictor::Init(
   }
 
   ctx_ = executor_->Prepare(*inference_program_, 0);
-  executor_->CreateVariables(
-      *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+  executor_->CreateVariables(*inference_program_,
+                             sub_scope_ ? sub_scope_ : scope_.get(), 0);
 
   // Get the feed_target_names and fetch_target_names
   feed_target_names_ = inference_program_->GetFeedTargetNames();
@@ -102,13 +103,13 @@ bool NativePaddlePredictor::Init(
 
 NativePaddlePredictor::~NativePaddlePredictor() {
   if (sub_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
     scope_->DeleteScope(sub_scope_);
   }
-};
+}
 
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
@@ -134,10 +135,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   // if share variables, we need not create variables
   VLOG(4) << "Run prepared context";
   executor_->RunPreparedContext(
-      ctx_.get(),
-      sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
-      &feed_targets,
-      &fetch_targets,
+      ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
+      &feed_targets, &fetch_targets,
+      false, /* don't create local scope each time*/
       false /* don't create variable eatch time */);
   VLOG(4) << "Finish prepared context";
   if (!GetFetch(fetchs, output_data)) {
@@ -181,9 +181,15 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     }
 
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
-    std::memcpy(static_cast<void *>(input_ptr),
-                inputs[i].data.data(),
+    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                 inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+
     feeds->push_back(input);
   }
   return true;
@@ -232,8 +238,7 @@ bool NativePaddlePredictor::GetFetch(
         size_t start = lod[0][j - 1] * common_dim;
         size_t end = lod[0][j] * common_dim;
         if (end > start) {
-          std::copy(output_ptr + start,
-                    output_ptr + end,
+          std::copy(output_ptr + start, output_ptr + end,
                     data.begin() + (j - 1) * max_dim * common_dim);
         }
       }
@@ -250,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
       buffer.Resize(sizeof(float) * data.size());
     }
     std::memcpy(buffer.data(), data.data(), buffer.length());
+    // copy LoD
+    for (const auto &level : fetchs[i].lod()) {
+      outputs->at(i).lod.emplace_back(level);
+    }
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
@@ -257,15 +266,13 @@ bool NativePaddlePredictor::GetFetch(
 }
 
 template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-    const NativeConfig &config) {
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
+    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
-        config.fraction_of_gpu_memory,
-        0.f,
+        config.fraction_of_gpu_memory, 0.f,
         "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
     PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
     std::vector<std::string> flags;
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/fluid/inference/api/api_impl.h
similarity index 93%
rename from paddle/contrib/inference/paddle_inference_api_impl.h
rename to paddle/fluid/inference/api/api_impl.h
index f9ec6f55449fc46b4a44b9563980cb5f8e80a951..4f28c3cd34bade4189871210e6168c6c1c610c2c 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -19,7 +19,7 @@
 #include <string>
 #include <vector>
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   bool Init(std::shared_ptr<framework::Scope> parent_scope);
 
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/fluid/inference/api/api_impl_tester.cc
similarity index 97%
rename from paddle/contrib/inference/test_paddle_inference_api_impl.cc
rename to paddle/fluid/inference/api/api_impl_tester.cc
index 88c4e665a3daed0ed34b23b75d360acbd586401f..fc1364b80ac1ee2d304eb2fe429eae5f56967516 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gflags/gflags.h"
-#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) {
   // which should be in the range [0.0, 1.0].
   feed_target_shapes[0][0] = batch_size;
   framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
-  SetupTensor<float>(
-      &input, input_dims, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
+                     static_cast<float>(1));
   std::vector<framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -249,7 +249,7 @@ void MainThreadsImageClassification(bool use_gpu) {
       const size_t len = local_outputs[0].data.length();
       float* data = static_cast<float*>(local_outputs[0].data.data());
       float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
       for (int i = 0; i < refs[tid].numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], 1e-3);
       }
diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
similarity index 75%
rename from paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
rename to paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index a11396cee91a758e86af2efd9e58b9da68442590..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644
--- a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
-#include "paddle/contrib/inference/paddle_inference_api_impl.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
 
@@ -64,8 +65,43 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
       return false;
     }
 
+    OptimizeInferenceProgram();
+    ctx_ = executor_->Prepare(*inference_program_, 0);
+
+    VLOG(5) << "to create variables";
+    executor_->CreateVariables(*inference_program_,
+                               sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+    // Get the feed_target_names and fetch_target_names
+    feed_target_names_ = inference_program_->GetFeedTargetNames();
+    fetch_target_names_ = inference_program_->GetFetchTargetNames();
+    return true;
+  }
+
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+
+  void OptimizeInferenceProgram() {
     // Analyze inference_program
     Argument argument;
+    if (!config_.model_dir.empty()) {
+      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
+    } else {
+      PADDLE_ENFORCE(
+          !config_.param_file.empty(),
+          "Either model_dir or (param_file, prog_file) should be set.");
+      PADDLE_ENFORCE(!config_.prog_file.empty());
+      argument.fluid_model_program_path.reset(
+          new std::string(config_.prog_file));
+      argument.fluid_model_param_path.reset(
+          new std::string(config_.param_file));
+    }
     argument.origin_program_desc.reset(
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);
@@ -73,17 +109,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
     VLOG(5) << "transformed program:\n"
             << argument.transformed_program_desc->SerializeAsString();
     VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
-    ctx_ = executor_->Prepare(*inference_program_, 0);
-
-    VLOG(5) << "to create variables";
-    executor_->CreateVariables(
-        *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
-
-    // Get the feed_target_names and fetch_target_names
-    feed_target_names_ = inference_program_->GetFeedTargetNames();
-    fetch_target_names_ = inference_program_->GetFetchTargetNames();
-    return true;
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
   }
 
  private:
@@ -98,8 +125,7 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
-        config.fraction_of_gpu_memory,
-        0.f,
+        config.fraction_of_gpu_memory, 0.f,
         "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
     PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
     std::vector<std::string> flags;
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0
--- /dev/null
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+
+  //# 1. Create PaddlePredictor with a config.
+  NativeConfig config0;
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.use_gpu = true;
+  config0.fraction_of_gpu_memory = 0.3;
+  config0.device = 0;
+
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<TensorRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
+    //# 2. Prepare input.
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;
+
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({10, 1});
+    tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
+    tensor.dtype = PaddleDType::INT64;
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs0;
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs0.size(), 1UL);
+    ASSERT_EQ(outputs1.size(), 1UL);
+
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+    EXPECT_EQ(num_elements, num_elements1);
+
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
+    }
+  }
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/fluid/inference/api/api_tester.cc
similarity index 91%
rename from paddle/contrib/inference/test_paddle_inference_api.cc
rename to paddle/fluid/inference/api/api_tester.cc
index bc7faab6e208a66d7a56e41a56bd743c7644eea2..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644
--- a/paddle/contrib/inference/test_paddle_inference_api.cc
+++ b/paddle/fluid/inference/api/api_tester.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 namespace paddle {
 
@@ -36,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
     LOG(INFO) << "I get other_config " << config.other_config;
   }
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = 0) override {
     LOG(INFO) << "Run";
     return false;
   }
diff --git a/paddle/fluid/inference/api/demo_ci/.gitignore b/paddle/fluid/inference/api/demo_ci/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba73a6eaa6fc885b6b56c2d6330394e2f9c384bf
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_inference_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
+option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
+
+if(WITH_GPU)
+  set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+endif()
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so 
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+  endif()
+else()
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+endif()
+
+# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
+if(WITH_STATIC_LIB)
+  set(DEPS
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
+else()
+  set(DEPS
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+endif()
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
+    glog gflags protobuf snappystream snappy z
+    ${EXTERNAL_LIB})
+if(WITH_GPU)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
diff --git a/paddle/fluid/inference/api/demo_ci/README.md b/paddle/fluid/inference/api/demo_ci/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f013da7f30acd84ec484773f4ea716a08efa0ff
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/README.md
@@ -0,0 +1,26 @@
+# Inference Demos
+
+There are several demos:
+
+- simple_on_word2vec: 
+  - Follow the C++ codes is in `simple_on_word2vec.cc`. 
+  - It is suitable for word2vec model.
+- vis_demo: 
+  - Follow the C++ codes is in `vis_demo.cc`. 
+  - It is suitable for mobilenet, se_resnext50 and ocr three models.
+  - Input data format:
+    - Each line contains a single record
+    - Each record's format is
+    ```
+    <space splitted floats as data>\t<space splitted ints as shape>
+    ```
+
+To build and execute the demos, simply run 
+```
+./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU
+```
+- It will build and execute the demos in both static and shared library.
+- `$PADDLE_ROOT`: paddle library path
+- `$TURN_ON_MKL`: use MKL or Openblas
+- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode
+- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first.
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0d9f3d2aa237acaf3bd7adb031b1f2a73c555352
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/clean.sh
@@ -0,0 +1,4 @@
+set -x
+cd `dirname $0`
+rm -rf build/ data/
+set +x
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3e829dd726b132844a45427b7b0b39eedf197496
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -0,0 +1,81 @@
+set -x
+PADDLE_ROOT=$1
+TURN_ON_MKL=$2 # use MKL or Openblas
+TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
+if [ $2 == ON ]; then
+  # You can export yourself if move the install path
+  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
+fi
+if [ $3 == ON ]; then
+  use_gpu_list='true false'
+else    
+  use_gpu_list='false'
+fi
+
+# download vis_demo data
+function download() {
+  dir_name=$1
+  mkdir -p $dir_name
+  cd $dir_name
+  wget -q ${URL_ROOT}$dir_name.tar.gz
+  tar xzf *.tar.gz
+  cd ..
+}
+URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
+mkdir -p data
+cd data
+vis_demo_list='se_resnext50 ocr mobilenet'
+for vis_demo_name in $vis_demo_list; do
+  download $vis_demo_name
+done
+cd ..
+
+# compile and test the demo
+mkdir -p build
+cd build
+
+for WITH_STATIC_LIB in ON OFF; do
+  # -----simple_on_word2vec-----
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$TURN_ON_MKL \
+    -DDEMO_NAME=simple_on_word2vec \
+    -DWITH_GPU=$TEST_GPU_CPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make -j
+  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
+  if [ -d $word2vec_model ]; then
+    for use_gpu in $use_gpu_list; do
+      ./simple_on_word2vec \
+        --dirname=$word2vec_model \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "simple_on_word2vec demo runs fail."
+        exit 1
+      fi
+    done
+  fi
+  # ---------vis_demo---------
+  rm -rf *
+  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    -DWITH_MKL=$TURN_ON_MKL \
+    -DDEMO_NAME=vis_demo \
+    -DWITH_GPU=$TEST_GPU_CPU \
+    -DWITH_STATIC_LIB=$WITH_STATIC_LIB
+  make -j
+  for use_gpu in $use_gpu_list; do
+    for vis_demo_name in $vis_demo_list; do 
+      ./vis_demo \
+        --modeldir=../data/$vis_demo_name/model \
+        --data=../data/$vis_demo_name/data.txt \
+        --refer=../data/$vis_demo_name/result.txt \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "vis demo $vis_demo_name runs fail."
+        exit 1
+      fi
+    done
+  done
+done
+set +x
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
similarity index 60%
rename from paddle/contrib/inference/demo/simple_on_word2vec.cc
rename to paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index c253014642f39a042430992548a285cc7078a959..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644
--- a/paddle/contrib/inference/demo/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -16,21 +16,27 @@ limitations under the License. */
  * This file contains a simple demo for how to take a model for inference.
  */
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <memory>
-#include <thread>
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <thread>  //NOLINT
+#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
 
 namespace paddle {
 namespace demo {
 
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
 void Main(bool use_gpu) {
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  if (FLAGS_dirname.empty()) {
+    LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
+    exit(1);
+  }
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -41,10 +47,10 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
 
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({4, 1});
+    tensor.data = PaddleBuf(data, sizeof(data));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
     CHECK(predictor->Run(slots, &outputs));
 
     //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    PADDLE_ENFORCE(outputs.size(), 1UL);
+    // Check the output buffer size and result of each tid.
+    PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+    float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
+                       0.000932706};
     const size_t num_elements = outputs.front().data.length() / sizeof(float);
     // The outputs' buffers are in CPU memory.
     for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+      PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                     result[i]);
     }
   }
 }
@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   // Multi-threads only support on CPU
   // 0. Create PaddlePredictor with a config.
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname;
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
@@ -84,24 +94,28 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+        PaddleTensor tensor;
+        tensor.shape = std::vector<int>({4, 1});
+        tensor.data = PaddleBuf(data, sizeof(data));
+        tensor.dtype = PaddleDType::INT64;
+
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
         // 3. Run
         CHECK(predictor->Run(inputs, &outputs));
 
         // 4. Get output.
-        ASSERT_EQ(outputs.size(), 1UL);
-        LOG(INFO) << "TID: " << tid << ", "
-                  << "output buffer size: " << outputs.front().data.length();
+        PADDLE_ENFORCE(outputs.size(), 1UL);
+        // Check the output buffer size and result of each tid.
+        PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+        float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
+                           0.000932706};
         const size_t num_elements =
             outputs.front().data.length() / sizeof(float);
         // The outputs' buffers are in CPU memory.
         for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+          PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                         result[i]);
         }
       }
     });
@@ -111,15 +125,18 @@ void MainThreads(int num_threads, bool use_gpu) {
   }
 }
 
-TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
-TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
-#endif
-
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu*/);
+  paddle::demo::MainThreads(1, false /* use_gpu*/);
+  paddle::demo::MainThreads(4, false /* use_gpu*/);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /*use_gpu*/);
+    paddle::demo::MainThreads(1, true /*use_gpu*/);
+    paddle::demo::MainThreads(4, true /*use_gpu*/);
+  }
+  return 0;
+}
diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
similarity index 93%
rename from paddle/contrib/inference/demo/utils.h
rename to paddle/fluid/inference/api/demo_ci/utils.h
index b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1..cb8990671162dff47228736e69617229528cc093 100644
--- a/paddle/contrib/inference/demo/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -13,16 +13,15 @@
 // limitations under the License.
 
 #pragma once
+#include <algorithm>
 #include <string>
 #include <vector>
-
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/inference/paddle_inference_api.h"
 
 namespace paddle {
 namespace demo {
 
-static void split(const std::string& str,
-                  char sep,
+static void split(const std::string& str, char sep,
                   std::vector<std::string>* pieces) {
   pieces->clear();
   if (str.empty()) {
diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
similarity index 70%
rename from paddle/contrib/inference/demo/vis_demo.cc
rename to paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 45575f9a862de430236ae20cf498e542a45b1f4b..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644
--- a/paddle/contrib/inference/demo/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -18,26 +18,24 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
-#include "paddle/contrib/inference/demo/utils.h"
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/fluid/inference/demo_ci/utils.h"
+#include "paddle/fluid/platform/enforce.h"
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 #endif
-
-namespace paddle {
-namespace demo {
-
 DEFINE_string(modeldir, "", "Directory of the inference model.");
 DEFINE_string(refer, "", "path to reference result for comparison.");
 DEFINE_string(
-    data,
-    "",
+    data, "",
     "path of data; each line is a record, format is "
     "'<space splitted floats as data>\t<space splitted ints as shape'");
+DEFINE_bool(use_gpu, false, "Whether use gpu.");
+
+namespace paddle {
+namespace demo {
 
 struct Record {
   std::vector<float> data;
@@ -47,7 +45,7 @@ struct Record {
 void split(const std::string& str, char sep, std::vector<std::string>* pieces);
 
 Record ProcessALine(const std::string& line) {
-  LOG(INFO) << "process a line";
+  VLOG(3) << "process a line";
   std::vector<std::string> columns;
   split(line, '\t', &columns);
   CHECK_EQ(columns.size(), 2UL)
@@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) {
   for (auto& s : shape_strs) {
     record.shape.push_back(std::stoi(s));
   }
-  LOG(INFO) << "data size " << record.data.size();
-  LOG(INFO) << "data shape size " << record.shape.size();
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
   return record;
 }
 
@@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
   file.close();
 
   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  LOG(INFO) << "predictor output numel " << numel;
-  LOG(INFO) << "reference output numel " << refer.data.size();
-  EXPECT_EQ(numel, refer.data.size());
+  VLOG(3) << "predictor output numel " << numel;
+  VLOG(3) << "reference output numel " << refer.data.size();
+  PADDLE_ENFORCE_EQ(numel, refer.data.size());
   switch (output.dtype) {
     case PaddleDType::INT64: {
       for (size_t i = 0; i < numel; ++i) {
-        EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
+        PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
+                          refer.data[i]);
       }
       break;
     }
     case PaddleDType::FLOAT32:
       for (size_t i = 0; i < numel; ++i) {
-        EXPECT_NEAR(
-            static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
+        PADDLE_ENFORCE_LT(
+            fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
+            1e-5);
       }
       break;
   }
@@ -106,15 +106,15 @@ void Main(bool use_gpu) {
   config.prog_file = FLAGS_modeldir + "/__model__";
   config.use_gpu = use_gpu;
   config.device = 0;
-#ifdef PADDLE_WITH_CUDA
-  config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
-#endif
+  if (FLAGS_use_gpu) {
+    config.fraction_of_gpu_memory = 0.1;  // set by yourself
+  }
 
-  LOG(INFO) << "init predictor";
+  VLOG(3) << "init predictor";
   auto predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
 
-  LOG(INFO) << "begin to process data";
+  VLOG(3) << "begin to process data";
   // Just a single batch of data.
   std::string line;
   std::ifstream file(FLAGS_data);
@@ -123,27 +123,32 @@ void Main(bool use_gpu) {
   file.close();
 
   // Inference.
-  PaddleTensor input{
-      .name = "xx",
-      .shape = record.shape,
-      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
-      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
 
-  LOG(INFO) << "run executor";
+  VLOG(3) << "run executor";
   std::vector<PaddleTensor> output;
   predictor->Run({input}, &output);
 
-  LOG(INFO) << "output.size " << output.size();
+  VLOG(3) << "output.size " << output.size();
   auto& tensor = output.front();
-  LOG(INFO) << "output: " << SummaryTensor(tensor);
+  VLOG(3) << "output: " << SummaryTensor(tensor);
 
   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
 }
 
-TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
-#ifdef PADDLE_WITH_CUDA
-TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
-#endif
 }  // namespace demo
 }  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::demo::Main(false /* use_gpu*/);
+  if (FLAGS_use_gpu) {
+    paddle::demo::Main(true /*use_gpu*/);
+  }
+  return 0;
+}
diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md
similarity index 98%
rename from paddle/contrib/inference/high_level_api.md
rename to paddle/fluid/inference/api/high_level_api.md
index eb92885052a453d8c837bbf6f6e984efb509332a..8b8b6916d7e2b1a2f9fd09e9dfd2fe5a332461f5 100644
--- a/paddle/contrib/inference/high_level_api.md
+++ b/paddle/fluid/inference/api/high_level_api.md
@@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation.
 ## Reference
 
 - [paddle_inference_api.h](./paddle_inference_api.h)
-- [some demos](./demo)
+- [some demos](./demo_ci)
diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md
similarity index 96%
rename from paddle/contrib/inference/high_level_api_cn.md
rename to paddle/fluid/inference/api/high_level_api_cn.md
index a57f015a4e44d43ee4e475cf606faa6f05e095fa..2fb914592cbcb1b0c3f2ef33ff9cf4c295e427b6 100644
--- a/paddle/contrib/inference/high_level_api_cn.md
+++ b/paddle/fluid/inference/api/high_level_api_cn.md
@@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs));
 
 ## 详细代码参考
 
-- [inference demos](./demo)
-- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
+- [inference demos](./demo_ci)
+- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
similarity index 94%
rename from paddle/contrib/inference/paddle_inference_api.h
rename to paddle/fluid/inference/api/paddle_inference_api.h
index b8ba2d14a5c161d491d838888ea14b776f769f23..b24414e8245b1a4d90acce4fa1ad5690e06b47dd 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -40,6 +40,7 @@ class PaddleBuf {
   // Copy only available when memory is managed externally.
   explicit PaddleBuf(const PaddleBuf&);
   PaddleBuf& operator=(const PaddleBuf&);
+  PaddleBuf& operator=(PaddleBuf&&);
   // Do not own the memory.
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
@@ -67,9 +68,9 @@ struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
-  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::vector<std::vector<uint64_t>> lod;  // lod data
 };
 
 enum class PaddleEngineKind {
@@ -98,7 +99,8 @@ class PaddlePredictor {
   // responsible for the output tensor's buffer, either allocated or passed from
   // outside.
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;
 
   // Clone a predictor that share the model weights, the Cloned predictor should
   // be thread-safe.
@@ -125,9 +127,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 
 // Configurations for Anakin engine.
 struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
   int device;
   std::string model_file;
   int max_batch_size{-1};
+  TargetType target_type;
 };
 
 struct TensorRTConfig : public NativeConfig {
diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
new file mode 100755
index 0000000000000000000000000000000000000000..12b7b3e7e5982f193e48596b867953fc93841b61
--- /dev/null
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+lib=$1
+if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
+
+num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
+num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l)
+
+if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
+if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
+
+exit 0
diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym
new file mode 100644
index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.sym
@@ -0,0 +1 @@
+*paddle*
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 748f5a084e8c880df215a60fe51c835ba5cd3110..8f42a37cd3f8978b917b42e8f45a128b8422aa57 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,8 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc
-  DEPS tensorrt_engine mul_op)
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
+activation_op.cc
+  DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
   ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
@@ -13,3 +14,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
+nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+
+nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -20,11 +20,60 @@ namespace tensorrt {
 
 class Conv2dOpConverter : public OpConverter {
  public:
-  Conv2dOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
+    const int n_output = Y_t->dims()[0];
+    const int filter_h = Y_t->dims()[2];
+    const int filter_w = Y_t->dims()[3];
+
+    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+    const std::vector<int> dilations =
+        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+    const std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    const std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
+    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
+        nv_ksize, weight.get(), bias.get());
+    PADDLE_ENFORCE(layer != nullptr);
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+    layer->setDilation(nv_dilations);
+    layer->setNbGroups(groups);
+
+    auto output_name = op_desc.Output("Output").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3744550f60a1696aedd8a3ecd24f1b21d22325b9
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -0,0 +1,210 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ElementwiseWeightOpConverter : public OpConverter {
+ public:
+  ElementwiseWeightOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+
+    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
+    if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
+      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
+    }
+
+    if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    } else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
+               dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 1; i < dims_x.nbDims; i++) {
+        if (dims_y[i] != dims_x.d[i]) {
+          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+          break;
+        }
+      }
+      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+        for (int i = 1; i < dims_x.nbDims; i++) {
+          if (dims_y[i] != 1)
+            PADDLE_THROW(
+                "TensorRT unsupported weight shape for Elementwise op!");
+        }
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
+    }
+
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
+                                         static_cast<void*>(weight_data),
+                                         Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
+        shift_weights.get(), scale_weights.get(), power_weights.get());
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+class ElementwiseTensorOpConverter : public OpConverter {
+ public:
+  ElementwiseTensorOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
+
+    // The two input tensor should have the same dims
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+    if (dims_x.nbDims == dims_y.nbDims) {
+      for (int i = 0; i < dims_x.nbDims; i++) {
+        if (dims_x.d[i] != dims_y.d[i])
+          PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+    }
+
+    auto op_pair = ops.find(op_type_);
+    if (op_pair == ops.end()) {
+      PADDLE_THROW("Wrong elementwise op type!");
+    }
+    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
+        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+
+ protected:
+  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+      ops;
+  std::string op_type_;
+};
+
+const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+    ElementwiseTensorOpConverter::ops = {
+        {"add", nvinfer1::ElementWiseOperation::kSUM},
+        {"mul", nvinfer1::ElementWiseOperation::kPROD},
+        {"sub", nvinfer1::ElementWiseOperation::kSUB},
+        {"div", nvinfer1::ElementWiseOperation::kDIV},
+        {"min", nvinfer1::ElementWiseOperation::kMIN},
+        {"pow", nvinfer1::ElementWiseOperation::kPOW},
+        {"max", nvinfer1::ElementWiseOperation::kMAX},
+};
+
+class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
+};
+
+class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
+};
+
+class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
+};
+
+class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
+};
+
+class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
+};
+
+class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
+                          ElementwiseTensorAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
+                          ElementwiseTensorSubOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
+                          ElementwiseTensorDivOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
+                          ElementwiseTensorMulOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
+                          ElementwiseTensorMaxOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
+                          ElementwiseTensorMinOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
+                          ElementwiseTensorPowOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index bb603efaf30bb72d74b5583abc45d01a16c076a3..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -32,13 +32,13 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
   for (int h = 0; h < shape.h(); ++h) {
     for (int w = 0; w < shape.w(); ++w) {
       odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * ostrides.h() + w * ostrides.w()];
+          idata[h * istrides.h() + w * istrides.w()];
     }
   }
 }
-
+// indata c * k
 // Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
   int k = iweights.dims[1];
@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
 
     framework::LoDTensor tmp;
     tmp.Resize(Y_t->dims());
-    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
-           Y_t->dims()[0] * Y_t->dims()[1]);
-
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size() / sizeof(float)};
@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
 
     // The data layout of TRT FC layer's weight is different from fluid's FC,
     // need to reorder the elements.
-    ReorderCKtoKC(tmp_weight, &weight);
+    ReorderCKtoKC(weight, &tmp_weight);
 
     // Currently, the framework can only handle one fluid op -> one TRT layer,
     // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
 
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
                                        *const_cast<nvinfer1::ITensor*>(X),
-                                       n_output, weight.get(), bias.get());
+                                       n_output, tmp_weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
     engine_->SetITensor(output_name, layer->getOutput(0));
@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
 }  // namespace paddle
 
 REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
-USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3c342957360ad4192d838147bf37e84d233c2629..514eb659a8da73b6e56b5d17148ec0cb2aeaa135 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(mul);
 REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 6697952051c4b1997ca6b550da17a52e64cb3454..41faaf7212accaaec238062b1340e8da8fa6be33 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -55,6 +55,32 @@ class OpConverter {
         it = Registry<OpConverter>::Lookup("fc");
       }
     }
+    if (op_desc.Type().find("elementwise") != std::string::npos) {
+      static std::unordered_set<std::string> add_tensor_op_set{
+          "add", "mul", "sub", "div", "max", "min", "pow"};
+      // TODO(xingzhaolong): all mul, sub, div
+      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
+      // "sub", "div"};
+      static std::unordered_set<std::string> add_weight_op_set{"add"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      int op_type_len = op_desc.Type().size();
+      std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                                op_desc.Type());
+      } else {
+        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
+      }
+    }
+
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
@@ -93,6 +119,10 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
 
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
   struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
     trt_##op_type__##_converter() {                                            \
@@ -111,7 +141,3 @@ class OpConverter {
   extern int TouchConverterRegister_##op_type__();                      \
   static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
       TouchConverterRegister_##op_type__();
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11cad95361867476c6f775af778015da37f1cfb1
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
+ */
+class Pool2dOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4)
+        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+
+    std::string pool_type =
+        boost::get<std::string>(op_desc.GetAttr("pooling_type"));
+    std::vector<int> ksize =
+        boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
+    std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
+
+    nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
+    if (pool_type == "max") {
+      nv_pool_type = nvinfer1::PoolingType::kMAX;
+    } else if (pool_type == "avg") {
+      nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
+    } else {
+      PADDLE_THROW("TensorRT unsupported pooling type!");
+    }
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
+                                       *const_cast<nvinfer1::ITensor*>(input1),
+                                       nv_pool_type, nv_ksize);
+    PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
+REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 0a02a7bebf9efbd0555707e6cfa701ef1e7d9659..e82762ea03ecd00bce7cfb83b130a3436ccbfed3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(10);
+  validator.Execute(5);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(conv2d_op, test) {
+  std::unordered_set<std::string> parameters({"conv2d-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+
+  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
+  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
+  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("conv2d");
+  desc.SetInput("Input", {"conv2d-X"});
+  desc.SetInput("Filter", {"conv2d-Y"});
+  desc.SetOutput("Output", {"conv2d-Out"});
+
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("dilations", dilations);
+  desc.SetAttr("groups", groups);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(conv2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(elementwise_op, add_weight_test) {
+  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  int axis = 1;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+TEST(elementwise_op, add_tensor_test) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(8, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  // the defalut axis of elementwise op is -1
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(elementwise_add);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index a30253072ac581ceca85ca10151a176f87a7cb39..1ae2668e733aad23241c63b9985e708396d0b1bc 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -23,11 +23,10 @@ namespace tensorrt {
 TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
-  TRTConvertValidation validator(20, parameters, scope, 1000);
-
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
-  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -44,3 +43,4 @@ TEST(fc_op, test) {
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 1ce1130e5d660d717a1262a1fbdb4b620462c0b3..3d34cd7d5d0deca4d83a3f5b5ed0fb396c6acd56 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -23,7 +23,7 @@ namespace tensorrt {
 TEST(MulOpConverter, main) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
+  TRTConvertValidation validator(10, parameters, scope, 1000, false);
   validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
   validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(10);
+  validator.Execute(2);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9b79f86b0edba983019bd932f52b08711ff36d41..d6651a5b244ba31a01220e6299cb2016ae61fe64 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
   auto* conv2d_op = block->AppendOp();
+
+  // init trt engine
+  cudaStream_t stream_;
+  std::unique_ptr<TensorRTEngine> engine_;
+  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
+  engine_->InitNetwork();
+  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+
+  engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
+                        nvinfer1::Dims3(2, 5, 5));
+
   conv2d_op->SetType("conv2d");
+  conv2d_op->SetInput("Input", {"conv2d-X"});
+  conv2d_op->SetInput("Filter", {"conv2d-Y"});
+  conv2d_op->SetOutput("Output", {"conv2d-Out"});
 
-  OpConverter converter;
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  conv2d_op->SetAttr("strides", strides);
+  conv2d_op->SetAttr("paddings", paddings);
+  conv2d_op->SetAttr("dilations", dilations);
+  conv2d_op->SetAttr("groups", groups);
+
+  // init scope
   framework::Scope scope;
-  converter.ConvertBlock(*block->Proto(), {}, scope,
-                         nullptr /*TensorRTEngine*/);
+  std::vector<int> dim_vec = {3, 2, 3, 3};
+  auto* x = scope.Var("conv2d-Y");
+  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize(framework::make_ddim(dim_vec));
+
+  OpConverter converter;
+  converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
+                         engine_.get() /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include <fstream>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(Pool2dOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
+  validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d-X"});
+  desc.SetOutput("Out", {"pool2d-Out"});
+
+  std::vector<int> ksize({2, 2});
+  std::vector<int> strides({2, 2});
+  std::vector<int> paddings({0, 0});
+  std::string pooling_t = "max";
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 3b1f531adc5d756259df1c350f7f44bf71ee1f93..63c2f978f253df11100ecca83acae5eab6a0337d 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -39,7 +39,7 @@ namespace tensorrt {
 float random(float low, float high) {
   static std::random_device rd;
   static std::mt19937 mt(rd());
-  std::uniform_real_distribution<double> dist(1.0, 10.0);
+  std::uniform_real_distribution<double> dist(low, high);
   return dist(mt);
 }
 
@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
   size_t num_elements = analysis::AccuDims(dims, dims.size());
   PADDLE_ENFORCE_GT(num_elements, 0);
   auto* data = tensor->mutable_data<float>(place);
+
   for (size_t i = 0; i < num_elements; i++) {
     *(data + i) = random(0., 1.);
   }
@@ -62,13 +63,16 @@ class TRTConvertValidation {
  public:
   TRTConvertValidation() = delete;
 
-  TRTConvertValidation(int batch_size,
+  TRTConvertValidation(int max_batch_size,
                        const std::unordered_set<std::string>& parameters,
                        framework::Scope& scope,  // NOLINT
-                       int workspace_size = 1 << 10)
-      : parameters_(parameters), scope_(scope) {
+                       int workspace_size = 1 << 10, bool if_add_batch = true)
+      : parameters_(parameters),
+        scope_(scope),
+        if_add_batch_(if_add_batch),
+        max_batch_size_(max_batch_size) {
     // create engine.
-    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
     engine_->InitNetwork();
 
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
@@ -83,7 +87,7 @@ class TRTConvertValidation {
 
   // Declare a parameter varaible in the scope.
   void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims);
+    DeclVar(name, dims, true);
   }
 
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
@@ -91,12 +95,18 @@ class TRTConvertValidation {
   }
 
   // Declare a variable in a fluid Scope.
-  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
+               bool is_param = false) {
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
 
     // Init Fluid tensor.
     std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
+    // There is no batchsize in ITensor's shape, but We should add it to
+    // tensor's shape of fluid. If the variable is not parameter and the
+    // if_add_batch_ flag is true, add the max batchsize to dim_vec.
+    if (is_param != true && if_add_batch_ == true)
+      dim_vec.insert(dim_vec.begin(), max_batch_size_);
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -130,6 +140,7 @@ class TRTConvertValidation {
 
   void Execute(int batch_size) {
     // Execute Fluid Op
+    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
     op_->Run(scope_, place);
@@ -138,20 +149,25 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 200;
+    const size_t output_space_size = 3000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0],
-                              output_space_size * sizeof(float));
+      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
       cudaStreamSynchronize(*engine_->stream());
 
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+
+      size_t fluid_out_size = fluid_out.size();
+      if (if_add_batch_ == true) {
+        fluid_out_size =
+            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
+      }
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
-      for (size_t i = 0; i < fluid_out.size(); i++) {
+      for (size_t i = 0; i < fluid_out_size; i++) {
         // Loose the threshold for CI in different machine model.
         EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
       }
@@ -167,6 +183,12 @@ class TRTConvertValidation {
   std::unique_ptr<framework::OpDesc> op_desc_;
   const std::unordered_set<std::string>& parameters_;
   framework::Scope& scope_;
+  // The ITensor of trt does not cotain the batch size,
+  // bug, in most cases, we need to set batch size for
+  // fluid's tensor shape. This variable indicates
+  // whether to add batch size to tensor shape of fluid.
+  bool if_add_batch_;
+  int max_batch_size_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..b821c3d0bf425c46fae634fbf53f7ee63100ca5c 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -1,7 +1,7 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.
 You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
@@ -26,26 +26,31 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void TensorRTEngine::Build(const DescType& paddle_model) {
+int TensorRTEngine::runtime_batch_ = 1;
+
+void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
-  for (auto& buf : buffers_) {
+  batch_size_ = batch_size;
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
     PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
     PADDLE_ENFORCE_GT(buf.max_size, 0);
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
+  SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
   cudaStreamSynchronize(*stream_);
   // clean buffer
-  for (auto& buf : buffers_) {
+  for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
       buf.buffer = nullptr;
@@ -70,46 +75,51 @@ void TensorRTEngine::FreezeNetwork() {
 
   // allocate GPU buffers.
   buffers_.resize(buffer_sizes_.size());
-  for (auto& item : buffer_sizes_) {
+  for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phrase, need to
+    // infer from the TesorRT network.
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims);
+                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
+      PADDLE_ENFORCE_GT(item.second, 0);
     }
-    auto& buf = buffer(item.first);
+
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
-            << buf.buffer;
-    buf.size = buf.max_size = item.second;
+
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
+    buf.size = 0;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
     buf.device = DeviceType::GPU;
   }
 }
 
-nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                 nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dims) {
+                                                const nvinfer1::Dims &dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
 
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims);
+                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
   PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
 
-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
-                                   const std::string& name) {
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
+                                   const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
 
-  auto* output = layer->getOutput(offset);
+  auto *output = layer->getOutput(offset);
   SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
@@ -121,11 +131,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
   buffer_sizes_[name] = 0;
 }
 
-void TensorRTEngine::DeclareOutput(const std::string& name) {
+void TensorRTEngine::DeclareOutput(const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
 
-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
   PADDLE_ENFORCE(!output->isNetworkInput());
@@ -135,38 +145,52 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   buffer_sizes_[name] = 0;
 }
 
-void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
   return buffer(name).buffer;
 }
 
-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 
-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
+
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
-Buffer& TensorRTEngine::buffer(const std::string& name) {
+Buffer &TensorRTEngine::buffer(const std::string &name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -174,19 +198,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
   return buffers_[slot_offset];
 }
 
-void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                      size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                        cudaMemcpyHostToDevice, *stream_));
 }
 
-void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                      size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
+  buf.size = size;
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -194,19 +222,25 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                        cudaMemcpyDeviceToDevice, *stream_));
 }
 
-void TensorRTEngine::SetITensor(const std::string& name,
-                                nvinfer1::ITensor* tensor) {
+void TensorRTEngine::SetITensor(const std::string &name,
+                                nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
   PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 
-nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
   PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
 
+void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
+  runtime_batch_ = batch_size;
+}
+
+int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..694468c419c20089de1cdecff1a903ad0cc6e99f 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream ? stream : &default_stream_),
-        logger_(logger) {}
+        logger_(logger) {
+    cudaStreamCreate(&default_stream_);
+  }
 
   virtual ~TensorRTEngine();
 
@@ -115,12 +117,19 @@ class TensorRTEngine : public EngineBase {
 
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+  void SetRuntimeBatch(size_t batch_size);
+  int GetRuntimeBatch();
 
  private:
   // the max batch size
   int max_batch_;
+  // the runtime batch size
+  static int runtime_batch_;
   // the max memory size the engine uses
   int max_workspace_;
+
+  // batch size of the current data, will be updated each Executation.
+  int batch_size_{-1};
   cudaStream_t* stream_;
   // If stream_ is not set from outside, hold its own stream.
   cudaStream_t default_stream_;
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index e635f0f87d577a1f1ac74687ee60f762be525418..dc03702990587bf5e65d28da662d10df4d882110 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
     ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(1, 1 << 10, &stream_);
+    engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
     engine_->InitNetwork();
   }
 
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
 
   LOG(INFO) << "to get output";
   float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
+  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
 
   LOG(INFO) << "to checkout output";
   ASSERT_EQ(y_cpu, x_v * 2 + 3);
@@ -103,11 +103,80 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
 
   LOG(INFO) << "to get output";
   float y_cpu[2] = {-1., -1.};
-  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
+
+  auto dims = engine_->GetITensor("y")->getDimensions();
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 2);
+  ASSERT_EQ(dims.d[1], 1);
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
 
+TEST_F(TensorRTEngineTest, test_conv2d) {
+  // Weight in CPU memory.
+  float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  float raw_bias[1] = {0};
+
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::Dims3{1, 3, 3});
+  auto* conv_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
+                           weight.get(), bias.get());
+  PADDLE_ENFORCE(conv_layer != nullptr);
+  conv_layer->setStride(nvinfer1::DimsHW{1, 1});
+  conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
+
+  engine_->DeclareOutput(conv_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           18 * sizeof(float));
+  engine_->Execute(2);
+
+  LOG(INFO) << "to get output";
+  float* y_cpu = new float[18];
+  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  ASSERT_EQ(y_cpu[0], 4.0);
+  ASSERT_EQ(y_cpu[1], 6.0);
+}
+
+TEST_F(TensorRTEngineTest, test_pool2d) {
+  // Weight in CPU memory.
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::Dims3{1, 2, 2});
+
+  nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
+  auto* pool_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
+                           pool_t, nvinfer1::DimsHW{2, 2});
+
+  PADDLE_ENFORCE(pool_layer != nullptr);
+  pool_layer->setStride(nvinfer1::DimsHW{1, 1});
+  pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
+
+  engine_->DeclareOutput(pool_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           8 * sizeof(float));
+  engine_->Execute(2);
+
+  LOG(INFO) << "to get output";
+  float* y_cpu = new float[2];
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+
+  ASSERT_EQ(y_cpu[0], 2.0);
+  ASSERT_EQ(y_cpu[1], 5.0);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 2fa5a9540ba1311c7f87e6675a53044b23dd8276..017fc4cd7b11c150cb941fffca2606a4d707330f 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${TARGET_NAME}.cc
-        DEPS paddle_fluid
+        DEPS paddle_fluid_origin
         ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
@@ -43,6 +43,6 @@ inference_test(word2vec)
 # TODO(TJ): clean me up
 cc_test(test_inference_nlp
   SRCS test_inference_nlp.cc
-  DEPS paddle_fluid
+  DEPS paddle_fluid_origin
   ARGS
   --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 5cc1db12bb71e428d493e7c6f718b1c6ed431858..e2a3e9d46ef9f303d191d59253ffbe9f4826184b 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#endif
 
 DEFINE_string(model_path, "", "Directory of the inference model.");
 DEFINE_string(data_file, "", "File of input index data.");
@@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
 DECLARE_bool(use_mkldnn);
+DECLARE_int32(paddle_num_threads);
 
 inline double GetCurrentMs() {
   struct timeval time;
@@ -160,12 +158,7 @@ TEST(inference, nlp) {
   std::unique_ptr<paddle::framework::Scope> scope(
       new paddle::framework::Scope());
 
-#ifdef PADDLE_WITH_MKLML
-  // only use 1 thread number per std::thread
-  omp_set_dynamic(0);
-  omp_set_num_threads(1);
-  paddle::platform::SetNumThreads(1);
-#endif
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
 
   double start_ms = 0, stop_ms = 0;
   if (FLAGS_num_threads > 1) {
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 44c36b1683b037832a218df02184e7cd2ba143e9..695790a37dce889e838462b401ca4e89f09271d5 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname,
 
     // Ignore the profiling results of the first run
     std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    bool CreateLocalScope = CreateVars;
     if (PrepareContext) {
       ctx = executor.Prepare(*inference_program, 0);
       executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
-                                  &fetch_targets, true, CreateVars);
+                                  &fetch_targets, CreateLocalScope, CreateVars);
     } else {
       executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
-                   true, CreateVars);
+                   CreateLocalScope, CreateVars);
     }
 
     // Enable the profiler
@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname,
         // Note: if you change the inference_program, you need to call
         // executor.Prepare() again to get a new ExecutorPrepareContext.
         executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
-                                    &fetch_targets, CreateVars);
+                                    &fetch_targets, CreateLocalScope,
+                                    CreateVars);
       } else {
         executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
-                     CreateVars);
+                     CreateLocalScope, CreateVars);
       }
     }
 
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 4194ba197948b47003863196efdac1c08a7ae4f6..c2f45fdc99b87bc12c2aadf1985de6e98a24fce7 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -15,12 +15,17 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
 
+DEFINE_bool(free_idle_memory, false,
+            "If it is true, Paddle will try to free idle memory trunks during "
+            "running time.");
+
 namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
-                               size_t min_chunk_size, size_t max_chunk_size)
+BuddyAllocator::BuddyAllocator(
+    std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
+    size_t max_chunk_size)
     : min_chunk_size_(min_chunk_size),
       max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
@@ -151,13 +156,14 @@ void BuddyAllocator::Free(void* p) {
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
-  // Clean up if existing too much free memory
-
-  // Prefer freeing fallback allocation first
-  CleanIdleFallBackAlloc();
+  if (FLAGS_free_idle_memory) {
+    // Clean up if existing too much free memory
+    // Prefer freeing fallback allocation first
+    CleanIdleFallBackAlloc();
 
-  // Free normal allocation
-  CleanIdleNormalAlloc();
+    // Free normal allocation
+    CleanIdleNormalAlloc();
+  }
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28..f0c83efc23ce39c4fc89296d672e1e55751851bf 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <mutex>  // NOLINT
 #include <set>
 #include <tuple>
@@ -32,8 +33,8 @@ namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
-                 size_t max_chunk_size);
+  BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
+                 size_t min_chunk_size, size_t max_chunk_size);
 
   ~BuddyAllocator();
 
@@ -103,7 +104,7 @@ class BuddyAllocator {
 
  private:
   /*! Allocate CPU/GPU memory from system */
-  SystemAllocator* system_allocator_;
+  std::unique_ptr<SystemAllocator> system_allocator_;
   std::mutex mutex_;
 };
 
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index bd98ed81899440a46415d30b6d74fec2dac4c155..7c800b3c164049244770ceb2070b177d8307e85e 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
+
 #include "paddle/fluid/memory/malloc.h"
 
 #include "glog/logging.h"
@@ -34,12 +36,15 @@ namespace memory {
 using BuddyAllocator = detail::BuddyAllocator;
 
 BuddyAllocator* GetCPUBuddyAllocator() {
+  static std::once_flag init_flag;
   static detail::BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new detail::BuddyAllocator(new detail::CPUAllocator,
-                                   platform::CpuMinChunkSize(),
-                                   platform::CpuMaxChunkSize());
-  }
+
+  std::call_once(init_flag, []() {
+    a = new detail::BuddyAllocator(
+        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+  });
+
   return a;
 }
 
@@ -68,27 +73,33 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
 #ifdef PADDLE_WITH_CUDA
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
-  if (as == NULL) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator** a_arr = nullptr;
+
+  std::call_once(init_flag, [gpu_id]() {
     int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
+    PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
+                   gpu_num);
+
+    a_arr = new BuddyAllocator*[gpu_num];
+    for (int i = 0; i < gpu_num; i++) {
+      a_arr[i] = nullptr;
+      platform::SetDeviceId(i);
+      a_arr[i] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+      VLOG(10) << "\n\nNOTE: each GPU device use "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
     }
-  }
+  });
+
   platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id),
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
-  }
-  return as[gpu_id];
+  return a_arr[gpu_id];
 }
 
 template <>
@@ -125,12 +136,16 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
 }
 
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
-  static BuddyAllocator* ba = NULL;
-  if (ba == NULL) {
-    ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+  static std::once_flag init_flag;
+  static BuddyAllocator* ba = nullptr;
+
+  std::call_once(init_flag, []() {
+    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
-  }
+  });
+
   return ba;
 }
 
diff --git a/paddle/fluid/operators/.flatten_op.cc.swp b/paddle/fluid/operators/.flatten_op.cc.swp
new file mode 100644
index 0000000000000000000000000000000000000000..3395b6074b6a4c684a97674af702ca8b91dc85e9
Binary files /dev/null and b/paddle/fluid/operators/.flatten_op.cc.swp differ
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index ab1d2143330fb8cbfd535758a83bc71de939c4e0..4c3b8ec78190723598a56f7633764f10dd5047f3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -168,6 +168,8 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
       else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
       endif()
@@ -190,9 +192,9 @@ if(WITH_DISTRIBUTE)
     
     set(DISTRIBUTE_DEPS "")
     if(WITH_GRPC)
-        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
     else()
-        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
+        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
         if(WITH_BRPC_RDMA)
             find_library(IBVERBS_LIBRARY NAMES ibverbs)
             ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
-    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
     nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      DEPS tensorrt_engine_op
       analysis)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
@@ -259,12 +261,18 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
 op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(unsqueeze_op DEPS reshape_op)
+op_library(squeeze_op DEPS reshape_op)
+op_library(extract_rows_op DEPS memory)
+op_library(flatten_op DEPS reshape_op)
+
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index a7a28b02b67f2ef180ec0e273dbe7ef555f88ce2..84a584f424823a450effd4c36e9da600f5851da2 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
+// When compiled without CUDA, the CUDAMutableData() interface should not be
+// provided.
+#if defined(PADDLE_WITH_CUDA)
       if (platform::is_gpu_place(ctx.GetPlace())) {
         rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
+#endif
         rows = grad_merge.mutable_rows()->data();
+
+#if defined(PADDLE_WITH_CUDA)
       }
+#endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index c9871a9fe6b3b0d0cf671c2d155715f92c94fd8f..5edecd18e673da326ec119cf9a383f24f8045089 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -24,26 +24,34 @@ class AucOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Predict"),
+                   "Input of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
                    "Input of Label should not be null.");
-    auto inference_height = ctx->GetInputDim("Out")[0];
+    auto predict_width = ctx->GetInputDim("Predict")[1];
+    PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
+    auto predict_height = ctx->GetInputDim("Predict")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
-    PADDLE_ENFORCE_EQ(inference_height, label_height,
+    PADDLE_ENFORCE_EQ(predict_height, label_height,
                       "Out and Label should have same height.");
 
+    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
+
     ctx->SetOutputDim("AUC", {1});
-    ctx->ShareLoD("Out", /*->*/ "AUC");
+    ctx->SetOutputDim("TPOut", {num_thres});
+    ctx->SetOutputDim("TNOut", {num_thres});
+    ctx->SetOutputDim("FPOut", {num_thres});
+    ctx->SetOutputDim("FNOut", {num_thres});
+
+    ctx->ShareLoD("Predict", /*->*/ "AUC");
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
         ctx.device_context());
   }
 };
@@ -51,22 +59,25 @@ class AucOp : public framework::OperatorWithKernel {
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Out",
-             "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is sorted in descending order. This input should be the"
-             "output of topk."
+    AddInput("Predict",
+             "A floating point 2D tensor with shape [batch_size, 2], values "
+             "are in the range [0, 1]."
              "Typically, this tensor indicates the probability of each label");
-    AddInput("Indices",
-             "An int 2D tensor, indicating the indices of original"
-             "tensor before sorting. Typically, this tensor indicates which "
-             "label the probability stands for.");
     AddInput("Label",
-             "A 2D int tensor indicating the label of the training data."
-             "The height is batch size and width is always 1.");
+             "A 2D int tensor indicating the label of the training data. "
+             "shape: [batch_size, 1]");
+    AddInput("TP", "True-Positive value.");
+    AddInput("FP", "False-Positive value.");
+    AddInput("TN", "True-Negative value.");
+    AddInput("FN", "False-Negative value.");
     // TODO(typhoonzero): support weight input
     AddOutput("AUC",
               "A scalar representing the "
               "current area-under-the-curve.");
+    AddOutput("TPOut", "True-Positive value.");
+    AddOutput("FPOut", "False-Positive value.");
+    AddOutput("TNOut", "True-Negative value.");
+    AddOutput("FNOut", "False-Negative value.");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index 8b016c3d31ad83e66baeb298c61840cc529efa1e..0a18585edb54a76aff5ae72ecc71e0eebb9f9361 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -31,58 +31,54 @@ template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
+    auto* predict = ctx.Input<Tensor>("Predict");
     auto* label = ctx.Input<Tensor>("Label");
     auto* auc = ctx.Output<Tensor>("AUC");
+    // Only use output var for now, make sure it's persistable and
+    // not cleaned up for each batch.
+    auto* true_positive = ctx.Output<Tensor>("TPOut");
+    auto* false_positive = ctx.Output<Tensor>("FPOut");
+    auto* true_negative = ctx.Output<Tensor>("TNOut");
+    auto* false_negative = ctx.Output<Tensor>("FNOut");
 
-    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
+    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
 
     std::string curve = ctx.Attr<std::string>("curve");
     int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<float> thresholds_list;
+    std::vector<double> thresholds_list;
     thresholds_list.reserve(num_thresholds);
     for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
     }
-    const float kEpsilon = 1e-7;
+    const double kEpsilon = 1e-7;
     thresholds_list[0] = 0.0f - kEpsilon;
     thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
 
-    size_t batch_size = inference->dims()[0];
-    size_t inference_width = inference->dims()[1];
+    size_t batch_size = predict->dims()[0];
+    size_t inference_width = predict->dims()[1];
 
-    const T* inference_data = inference->data<T>();
-    const int64_t* label_data = label->data<int64_t>();
+    const T* inference_data = predict->data<T>();
+    const auto* label_data = label->data<int64_t>();
 
-    // Create local tensor for storing the curve: TP, FN, TN, FP
-    // TODO(typhoonzero): use eigen op to caculate these values.
-    Tensor true_positive, false_positive, true_negative, false_negative;
-
-    true_positive.Resize({num_thresholds});
-    false_negative.Resize({num_thresholds});
-    true_negative.Resize({num_thresholds});
-    false_positive.Resize({num_thresholds});
-
-    int64_t* tp_data = true_positive.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* fn_data = false_negative.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* tn_data = true_negative.mutable_data<int64_t>(ctx.GetPlace());
-    int64_t* fp_data = false_positive.mutable_data<int64_t>(ctx.GetPlace());
+    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
+    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
+    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
+    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
 
     for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // caculate TP, FN, TN, FP for current thresh
+      // calculate TP, FN, TN, FP for current thresh
       int64_t tp = 0, fn = 0, tn = 0, fp = 0;
       for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels >0 will be treated as true.
+        // NOTE: label_data used as bool, labels > 0 will be treated as true.
         if (label_data[i]) {
-          // use first(max) data in each row
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
               (thresholds_list[idx_thresh])) {
             tp++;
           } else {
             fn++;
           }
         } else {
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
               (thresholds_list[idx_thresh])) {
             fp++;
           } else {
@@ -91,27 +87,27 @@ class AucKernel : public framework::OpKernel<T> {
         }
       }
       // store rates
-      tp_data[idx_thresh] = tp;
-      fn_data[idx_thresh] = fn;
-      tn_data[idx_thresh] = tn;
-      fp_data[idx_thresh] = fp;
+      tp_data[idx_thresh] += tp;
+      fn_data[idx_thresh] += fn;
+      tn_data[idx_thresh] += tn;
+      fp_data[idx_thresh] += fp;
     }
     // epsilon to avoid divide by zero.
-    float epsilon = 1e-6;
+    double epsilon = 1e-6;
     // Riemann sum to caculate auc.
     Tensor tp_rate, fp_rate, rec_rate;
     tp_rate.Resize({num_thresholds});
     fp_rate.Resize({num_thresholds});
     rec_rate.Resize({num_thresholds});
-    float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
-    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
-    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
+    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
     for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                         (tp_data[i] + fn_data[i] + epsilon);
       fp_rate_data[i] =
-          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                          (tp_data[i] + fp_data[i] + epsilon);
     }
     *auc_data = 0.0f;
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 693bf973c2b8790d2c50cee9b86b365493e8c754..5912a1a17cbd29c3ebd83f37133c044f0905c8bd 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -216,6 +216,18 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       saved_mean_e.setZero();
       saved_variance_e.setZero();
 
+      EigenVectorArrayMap<T> running_mean_arr(
+          mean_out->mutable_data<T>(ctx.GetPlace()), C);
+      EigenVectorArrayMap<T> running_var_arr(
+          variance_out->mutable_data<T>(ctx.GetPlace()), C);
+
+      if ((N * sample_size) == 1) {
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
+        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+        return;
+      }
+
       switch (data_layout) {
         case DataLayout::kNCHW: {
           ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
@@ -247,10 +259,6 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
           PADDLE_THROW("Unknown storage order: %s", data_layout_str);
       }
 
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), C);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), C);
       running_mean_arr =
           running_mean_arr * momentum + saved_mean_e * (1. - momentum);
       running_var_arr =
@@ -427,6 +435,11 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
     d_bias_arr.setZero();
     d_scale_arr.setZero();
 
+    if ((N * sample_size) == 1) {
+      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
+      return;
+    }
+
     const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
 
     switch (data_layout) {
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index 550dd32d36767f90e880415bfffaf01aeb623609..ca6cd8669352fd5814f25a04433ca97fe4abe9ff 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -72,6 +72,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
 
+    auto *y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
     // ------------------- cudnn descriptors ---------------------
     cudnnTensorDescriptor_t data_desc_;
     cudnnTensorDescriptor_t bn_param_desc_;
@@ -93,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif
 
-    VLOG(1) << "Setting descriptors.";
+    VLOG(3) << "Setting descriptors.";
     std::vector<int> dims;
     std::vector<int> strides;
     if (data_layout == DataLayout::kNCHW) {
@@ -113,11 +116,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
 
-    auto *y = ctx.Output<Tensor>("Y");
-
-    // alloc memory
-    y->mutable_data<T>(ctx.GetPlace());
-
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
 
     auto handle = dev_ctx.cudnn_handle();
@@ -162,22 +160,28 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
       functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
-      double this_factor = 1. - momentum;
-
-      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
-          handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-          data_desc_, x->template data<T>(), data_desc_,
-          y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-          scale->template data<BatchNormParamType<T>>(),
-          bias->template data<BatchNormParamType<T>>(), this_factor,
-          mean_out->template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          variance_out->template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace()),
-          epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                       ctx.GetPlace()),
-          saved_variance->template mutable_data<BatchNormParamType<T>>(
-              ctx.GetPlace())));
+      if ((N * H * W * D) == 1) {
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
+        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+      } else {
+        double this_factor = 1. - momentum;
+
+        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
+            handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
+            data_desc_, x->template data<T>(), data_desc_,
+            y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
+            scale->template data<BatchNormParamType<T>>(),
+            bias->template data<BatchNormParamType<T>>(), this_factor,
+            mean_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            variance_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                         ctx.GetPlace()),
+            saved_variance->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace())));
+      }
     }
 
     // clean when exit.
@@ -209,6 +213,25 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
 
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<T>(ctx.GetPlace());
+    d_bias->mutable_data<T>(ctx.GetPlace());
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    if ((N * H * W * D) == 1) {
+      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
+      return;
+    }
+
     PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
     PADDLE_ENFORCE_EQ(scale->dims()[0], C);
 
@@ -247,21 +270,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
         bn_param_desc_, data_desc_, mode_));
 
-    // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
-
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
     const void *saved_mean_data = saved_mean->template data<T>();
     const void *saved_var_data = saved_var->template data<T>();
 
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc
index c4219a429a53eb4869426a2674109555fb784b85..3a2527e407bb179c4873fa3ffe2e8f22fb47faf7 100644
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
       VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
               << " and dir:" << dir << " to " << epmap[i];
     }
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 8cc1d94260baccfe28d213b7e021956819e2e79e..580fde753816c30b188b8a99cc63fcbafde64e25 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -205,9 +205,10 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase {
       context->SetOutputsDim(framework::GradVarName("Params"),
                              context->GetInputsDim("Params"));
     }
-    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
-    context->SetOutputsDim(framework::GradVarName("X"),
-                           context->GetInputsDim("X"));
+    if (context->HasOutputs(framework::GradVarName("X"))) {
+      context->SetOutputsDim(framework::GradVarName("X"),
+                             context->GetInputsDim("X"));
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 1828be57b5a54005a0066b18ebebdb740726f67a..22cbf680c0670552fb014043c69fcadc56863529 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_deterministic, true,
+DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
-            "false, the algorithm is deterministic.");
+            "true, the algorithm is deterministic.");
 
 namespace paddle {
 namespace operators {
@@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
@@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     if (dev_ctx.GetComputeCapability() >= 70 &&
         std::type_index(typeid(T)) ==
             std::type_index(typeid(platform::float16))) {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
 
     // get workspace size able to allocate
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
     // It is possible for float16 on Volta GPU to allocate more memory than
@@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
           cudnn_filter_desc, filter_data + i * group_offset_filter,
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
@@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -272,8 +272,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+      if (!FLAGS_cudnn_deterministic) {
+        CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
                 // dyDesc: Handle to the previously initialized input
@@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
 
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
               cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
@@ -297,8 +297,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+      if (!FLAGS_cudnn_deterministic) {
+        CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
@@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
 
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &tmp_size));
@@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
 
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
@@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 6b06913d1c83f4534238ac3dd22ac4035c0f0fbf..0511eb42a073ac305634110a71a35e501f062132 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -18,9 +18,6 @@
 namespace paddle {
 namespace operators {
 
-using conv_bwd_data = mkldnn::convolution_backward_data;
-using conv_bwd_weights = mkldnn::convolution_backward_weights;
-using conv_fwd = mkldnn::convolution_forward;
 using framework::DataLayout;
 using mkldnn::memory;
 using mkldnn::primitive;
@@ -29,6 +26,196 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
+class ConvMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  ConvMKLDNNHandler(
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
+      std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
+          conv_bwd_data_pd,
+      std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
+          conv_bwd_weights_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        conv_pd_(conv_pd),
+        conv_bwd_weights_pd_(conv_bwd_weights_pd),
+        conv_bwd_data_pd_(conv_bwd_data_pd) {
+    // If we are in Grad operatgor then update a key with BWD suffix to
+    // distinguish from FWD memory primitives
+    key_ += "-BWD";
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
+                               "@weights-src_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@weights-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
+        "@diff_weights_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@data-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
+    auto user_pd = user_weights_memory_p->get_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
+                               "@data-weights_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  std::shared_ptr<mkldnn::convolution_backward_weights>
+  AcquireConvolutionBackwardWeights(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_weights_p";
+    auto conv_bwd_weights_p =
+        std::static_pointer_cast<mkldnn::convolution_backward_weights>(
+            dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd weights primitive in device context");
+    if (conv_bwd_weights_p == nullptr) {
+      // create backward conv primitive for weights
+      conv_bwd_weights_p =
+          std::make_shared<mkldnn::convolution_backward_weights>(
+              *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
+              *diff_weights_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_weights_p;
+  }
+
+  std::shared_ptr<mkldnn::convolution_backward_data>
+  AcquireConvolutionBackwardData(
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_data_p";
+    auto conv_bwd_data_p =
+        std::static_pointer_cast<mkldnn::convolution_backward_data>(
+            dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_data_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd data primitive in device context");
+    if (conv_bwd_data_p == nullptr) {
+      conv_bwd_data_p = std::make_shared<mkldnn::convolution_backward_data>(
+          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
+          *diff_src_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_data_p;
+  }
+
+  // Generate keys for storing/retriving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimial
+  static std::string GetHash(memory::dims& input_dims,     // NOLINT
+                             memory::dims& weights_dims,   // NOLINT
+                             std::vector<int>& strides,    // NOLINT
+                             std::vector<int>& paddings,   // NOLINT
+                             std::vector<int>& dilations,  // NOLINT
+                             int groups, const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
+  std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
+      conv_bwd_weights_pd_;
+  std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
+      conv_bwd_data_pd_;
+};
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -36,10 +223,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
-    // Get unique name for index
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -80,86 +263,84 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory = memory(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-        to_void_cast(input_data));
-    auto user_weights_memory =
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
-               to_void_cast(filter_data));
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_pd = key + "@conv_pd";
+
+    std::vector<primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                          memory::format::any);
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-                                          memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
 
     // create a conv primitive descriptor and save it for usage in backward
-    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
-        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
+        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                             mkldnn_engine);
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    // create reorder primitive if the input format is not the preferred one
-    auto src_memory = user_src_memory;
-    primitive reorder_src;
-    bool is_src_reordered = false;
-    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
-        user_src_memory.get_primitive_desc()) {
-      src_memory = memory(conv_pd->src_primitive_desc());
-      reorder_src = reorder(user_src_memory, src_memory);
-      is_src_reordered = true;
-    }
-    auto weights_memory = user_weights_memory;
-    primitive reorder_weights;
-    bool is_weights_reordered = false;
-    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
-        user_weights_memory.get_primitive_desc()) {
-      weights_memory = memory(conv_pd->weights_primitive_desc());
-      reorder_weights = reorder(user_weights_memory, weights_memory);
-      is_weights_reordered = true;
-    }
+    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
-    // create memory primitive for conv dst
-    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, to_void_cast<T>(filter_data));
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline);
+    auto dst_memory_p =
+        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
     // create convolution op primitive
-    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
+    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                             dst_memory_p);
 
     // push primitive to stream and wait until it's executed
-    std::vector<primitive> pipeline;
-    if (is_src_reordered) pipeline.push_back(reorder_src);
-    if (is_weights_reordered) pipeline.push_back(reorder_weights);
-    pipeline.push_back(conv_prim);
+    pipeline.push_back(*conv_p);
     stream(stream::kind::eager).submit(pipeline).wait();
 
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
-
     output->set_layout(DataLayout::kMKLDNN);
-    output->set_format(GetMKLDNNFormat(dst_memory));
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
-  std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc(
-      const memory::desc& src, const memory::desc& weights,
-      const memory::desc& dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const mkldnn::engine& engine) const {
+  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
+  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
+                       const memory::desc& dst, const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const mkldnn::engine& engine) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
-    auto conv_desc =
-        conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct,
-                       src, weights, dst, stride_dims, padding_dims,
-                       padding_dims, mkldnn::padding_kind::zero);
+    auto conv_desc = mkldnn::convolution_forward::desc(
+        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
+        dst, stride_dims, padding_dims, padding_dims,
+        mkldnn::padding_kind::zero);
 
-    auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine);
+    auto p_conv_pd =
+        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
 
-    return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd);
+    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
+        p_conv_pd);
   }
 };
 
@@ -197,13 +378,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     if (!input_grad && !filter_grad) return;
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Input("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
-
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -223,146 +401,116 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // create mkldnn memory from input tensors (input/weights/output_grad)
-    auto user_src_memory = memory(
-        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-        to_void_cast(input_data));
-    auto user_weights_memory =
-        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-                mkldnn_engine},
-               to_void_cast(filter_data));
-    auto user_diff_dst_memory =
-        memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
-                mkldnn_engine},
-               to_void_cast(output_grad_data));
+    // Get an unique name from "argument" name of "Output" variable
+    // as well as attributes of primitive to be created
+    // This name will be used as key when saving info into device context
+    const std::string key =
+        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
+                                   dilations, groups, ctx.op().Input("Output"));
+
+    const std::string key_conv_pd = key + "@conv_pd";
+    std::vector<primitive> pipeline;
+
+    // Create user memory descriptors
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md = platform::MKLDNNMemDesc(
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+    auto user_diff_dst_md = platform::MKLDNNMemDesc(
+        {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
 
     /* create memory descriptor for conv backward without specified format
      * ('any') which lets a primitive (conv backward in this case) choose
      * the memory format preferred for best performance
      */
-    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                          memory::format::any);
-    auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                                               memory::format::any);
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto diff_src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
     auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, memory::data_type::f32, memory::format::any);
-    auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-                                               memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
 
     // Retrieve conv_pd from device context
-    auto conv_pd = std::static_pointer_cast<conv_fwd::primitive_desc>(
-        dev_ctx.GetBlob(key_conv_pd));
+    auto conv_pd =
+        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_conv_pd));
     PADDLE_ENFORCE(conv_pd != nullptr,
                    "Fail to find conv_pd in device context");
 
+    // create backward convolution weights primitive descriptor
+    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
+        mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
+        strides, paddings, paddings, mkldnn::padding_kind::zero);
+    auto conv_bwd_weights_pd =
+        std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
+            conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
+
+    // create backward convolution data primitive descriptor
+    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
+        mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
+        strides, paddings, paddings, mkldnn::padding_kind::zero);
+    auto conv_bwd_data_pd =
+        std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
+            conv_bwd_data_desc, mkldnn_engine, *conv_pd);
+
+    ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd,
+                              dev_ctx, mkldnn_engine, key);
+
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p =
+        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, to_void_cast<T>(filter_data));
+    auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        user_diff_dst_md, to_void_cast<T>(output_grad_data));
+
     // create backward conv primitive for weights
     if (filter_grad) {
-      // create backward convolution primitive descriptor
-      auto conv_bwd_weights_desc = conv_bwd_weights::desc(
-          mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
-          strides, paddings, paddings, mkldnn::padding_kind::zero);
-      auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc(
-          conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
-
-      // create reorder primitive if the input format is not the preferred one
-      auto src_memory = user_src_memory;
-      primitive reorder_src;
-      bool is_src_reordered = false;
-      if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) !=
-          user_src_memory.get_primitive_desc()) {
-        src_memory = memory(conv_bwd_weights_pd.src_primitive_desc());
-        reorder_src = reorder(user_src_memory, src_memory);
-        is_src_reordered = true;
-      }
-
-      auto diff_dst_memory_4filter = user_diff_dst_memory;
-      primitive reorder_diff_dst_4filter;
-      bool is_diff_dst_reordered_4filter = false;
-      if (memory::primitive_desc(
-              conv_bwd_weights_pd.diff_dst_primitive_desc()) !=
-          user_diff_dst_memory.get_primitive_desc()) {
-        diff_dst_memory_4filter =
-            memory(conv_bwd_weights_pd.diff_dst_primitive_desc());
-        reorder_diff_dst_4filter =
-            reorder(user_diff_dst_memory, diff_dst_memory_4filter);
-        is_diff_dst_reordered_4filter = true;
-      }
-
-      // create mkldnn memory for output (i.e. diff weights)
-      auto diff_weights_memory =
-          memory(conv_bwd_weights_pd.diff_weights_primitive_desc(),
-                 reinterpret_cast<void*>(filter_grad_data));
+      auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive(
+          user_src_memory_p, pipeline);
 
-      // create backward conv primitive for weights
-      auto conv_bwd_weights_prim =
-          conv_bwd_weights(conv_bwd_weights_pd, src_memory,
-                           diff_dst_memory_4filter, diff_weights_memory);
-
-      // push primitive and execute it
-      std::vector<primitive> pipeline;
-      if (is_src_reordered) pipeline.push_back(reorder_src);
-      if (is_diff_dst_reordered_4filter)
-        pipeline.push_back(reorder_diff_dst_4filter);
-      pipeline.push_back(conv_bwd_weights_prim);
-      stream(stream::kind::eager).submit(pipeline).wait();
+      auto diff_dst_memory_4filter_p =
+          handler.AcquireDiffDstMemoryFromWeightsPrimitive(
+              user_diff_dst_memory_p, pipeline);
+
+      auto diff_weights_memory_p =
+          handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
+              reinterpret_cast<void*>(filter_grad_data));
+
+      auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(
+          src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p);
+
+      // push primitive to stream and wait until it's executed
+      pipeline.push_back(*conv_bwd_weights_p);
 
       filter_grad->set_layout(DataLayout::kMKLDNN);
-      filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory));
+      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
     }
 
     if (input_grad) {
-      // create backward convolution primitive descriptor
-      auto conv_bwd_data_desc = conv_bwd_data::desc(
-          mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
-          strides, paddings, paddings, mkldnn::padding_kind::zero);
-      auto conv_bwd_data_pd = conv_bwd_data::primitive_desc(
-          conv_bwd_data_desc, mkldnn_engine, *conv_pd);
-
-      // create reorder primitive if the input format is not the preferred one
-      auto weights_memory = user_weights_memory;
-      primitive reorder_weights;
-      bool is_weights_reordered = false;
-      if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) !=
-          user_weights_memory.get_primitive_desc()) {
-        weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc());
-        reorder_weights = reorder(user_weights_memory, weights_memory);
-        is_weights_reordered = true;
-      }
-
-      auto diff_dst_memory_4data = user_diff_dst_memory;
-      primitive reorder_diff_dst_4data;
-      bool is_diff_dst_reordered_4data = false;
-      if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) !=
-          user_diff_dst_memory.get_primitive_desc()) {
-        diff_dst_memory_4data =
-            memory(conv_bwd_data_pd.diff_dst_primitive_desc());
-        reorder_diff_dst_4data =
-            reorder(user_diff_dst_memory, diff_dst_memory_4data);
-        is_diff_dst_reordered_4data = true;
-      }
-
-      // create mkldnn memory for output (i.e. diff src)
-      auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(),
-                                    reinterpret_cast<void*>(input_grad_data));
-
-      // create backward conv primitive for data
-      auto conv_bwd_data_prim =
-          conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory,
-                        diff_src_memory);
-
-      // push primitive and execute it
-      std::vector<primitive> pipeline;
-      if (is_weights_reordered) pipeline.push_back(reorder_weights);
-      if (is_diff_dst_reordered_4data)
-        pipeline.push_back(reorder_diff_dst_4data);
-      pipeline.push_back(conv_bwd_data_prim);
-      stream(stream::kind::eager).submit(pipeline).wait();
+      auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(
+          user_weights_memory_p, pipeline);
+
+      auto diff_dst_memory_4data_p =
+          handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
+                                                        pipeline);
+
+      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
+          reinterpret_cast<void*>(input_grad_data));
+
+      auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData(
+          diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p);
+
+      pipeline.push_back(*conv_bwd_data_p);
 
       input_grad->set_layout(DataLayout::kMKLDNN);
-      input_grad->set_format(GetMKLDNNFormat(diff_src_memory));
+      input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
     }
+    stream(stream::kind::eager).submit(pipeline).wait();
   }  // Compute()
 };
 
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 038ea8999072f562104c5386ed18b6b275816345..82fff68e7557b3f0b44e6faf2a50e5a0ecbba589 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
         // dxDesc: Handle to the previously initialized output tensor
         // descriptor.
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
         workspace_size_limit, &algo));
 
     // get workspace size able to allocate
-    PADDLE_ENFORCE(
+    CUDNN_ENFORCE(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     for (int g = 0; g < groups; g++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
           handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
           cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
           algo, cudnn_workspace, workspace_size_in_bytes, &beta,
@@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
           handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
           workspace_size_limit, &data_algo));
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
           handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
@@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     if (filter_grad) {
       // choose backward algorithm for filter
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc,
@@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
               workspace_size_limit, &filter_algo));
 
       // get workspace for backwards filter algorithm
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
             handle, &alpha, cudnn_output_desc,
             output_grad_data + output_grad_offset * g, cudnn_filter_desc,
             filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
@@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_output_desc,
             output_grad_data + output_grad_offset * g, cudnn_input_desc,
             input_data + input_offset * g, cudnn_conv_desc, filter_algo,
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a..a3bec3da45136bca5cb2763e7ffd6b67703a1813 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -124,8 +124,7 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "Tensor<float/double> with shape [N x D].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape "
-              "[N x 1]. The cross entropy loss.")
-        .Reuse("X");
+              "[N x 1]. The cross entropy loss.");
     AddAttr<bool>("soft_label",
                   "(bool, default false), a flag indicating whether to "
                   "interpretate the given labels as soft labels.")
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 6d296ff7bf14de9175dc589dfa8b46c534127ca1..a44d84cd7b99107fef09a6b4dfa60172fabd718b 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -27,7 +27,8 @@ anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
 target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
-    polygon_box_transform_op.cu)
+polygon_box_transform_op.cu)
+detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 
 # Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index 4e35c38e4e03d4d0f00601812fdc4803519b89ae..b5cb6a724c095eb849f3a184f13843e1a0cca92f 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float) "
                    "Prior boxes center offset.")
         .SetDefault(0.5);
+    AddAttr<bool>(
+        "min_max_aspect_ratios_order",
+        "(bool) If set True, the output prior box is in order of"
+        "[min, max, aspect_ratios], which is consistent with Caffe."
+        "Please note, this order affects the weights order of convolution layer"
+        "followed by and does not affect the final detection results.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Prior box operator
 Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
index f67e6ca91c0852b5a3be35d23246884d1157caa4..1ea8cfc1d2af8cc6c332768a467cdcd4c0166319 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                             const int im_width, const int as_num,
                             const T offset, const T step_width,
                             const T step_height, const T* min_sizes,
-                            const T* max_sizes, const int min_num,
-                            bool is_clip) {
+                            const T* max_sizes, const int min_num, bool is_clip,
+                            bool min_max_aspect_ratios_order) {
   int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
   int box_num = height * width * num_priors;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
@@ -44,14 +44,28 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
     T min_size = min_sizes[m];
     if (max_sizes) {
       int s = p % (as_num + 1);
-      if (s < as_num) {
-        T ar = aspect_ratios[s];
-        bw = min_size * sqrt(ar) / 2.;
-        bh = min_size / sqrt(ar) / 2.;
+      if (!min_max_aspect_ratios_order) {
+        if (s < as_num) {
+          T ar = aspect_ratios[s];
+          bw = min_size * sqrt(ar) / 2.;
+          bh = min_size / sqrt(ar) / 2.;
+        } else {
+          T max_size = max_sizes[m];
+          bw = sqrt(min_size * max_size) / 2.;
+          bh = bw;
+        }
       } else {
-        T max_size = max_sizes[m];
-        bw = sqrt(min_size * max_size) / 2.;
-        bh = bw;
+        if (s == 0) {
+          bw = bh = min_size / 2.;
+        } else if (s == 1) {
+          T max_size = max_sizes[m];
+          bw = sqrt(min_size * max_size) / 2.;
+          bh = bw;
+        } else {
+          T ar = aspect_ratios[s - 1];
+          bw = min_size * sqrt(ar) / 2.;
+          bh = min_size / sqrt(ar) / 2.;
+        }
       }
     } else {
       int s = p % as_num;
@@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto variances = ctx.Attr<std::vector<float>>("variances");
     auto flip = ctx.Attr<bool>("flip");
     auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
 
     std::vector<float> aspect_ratios;
     ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     GenPriorBox<T><<<grid, block, 0, stream>>>(
         boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
         aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
-        max_data, min_num, clip);
+        max_data, min_num, clip, min_max_aspect_ratios_order);
 
     framework::Tensor v;
     framework::TensorFromVector(variances, ctx.device_context(), &v);
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index 1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35..4e226abbb51c271502f0ca5419d488643b5a1a82 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto variances = ctx.Attr<std::vector<float>>("variances");
     auto flip = ctx.Attr<bool>("flip");
     auto clip = ctx.Attr<bool>("clip");
+    auto min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
 
     std::vector<float> aspect_ratios;
     ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
@@ -108,26 +110,59 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
         int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
-          // priors with different aspect ratios
-          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-            float ar = aspect_ratios[r];
-            box_width = min_size * sqrt(ar) / 2.;
-            box_height = min_size / sqrt(ar) / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
-          }
-          if (max_sizes.size() > 0) {
-            auto max_size = max_sizes[s];
-            // square prior with size sqrt(minSize * maxSize)
-            box_width = box_height = sqrt(min_size * max_size) / 2.;
+          if (min_max_aspect_ratios_order) {
+            box_width = box_height = min_size / 2.;
             e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
             e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
             e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
             e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
             idx++;
+            if (max_sizes.size() > 0) {
+              auto max_size = max_sizes[s];
+              // square prior with size sqrt(minSize * maxSize)
+              box_width = box_height = sqrt(min_size * max_size) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+            // priors with different aspect ratios
+            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+              float ar = aspect_ratios[r];
+              if (fabs(ar - 1.) < 1e-6) {
+                continue;
+              }
+              box_width = min_size * sqrt(ar) / 2.;
+              box_height = min_size / sqrt(ar) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+          } else {
+            // priors with different aspect ratios
+            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+              float ar = aspect_ratios[r];
+              box_width = min_size * sqrt(ar) / 2.;
+              box_height = min_size / sqrt(ar) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
+            if (max_sizes.size() > 0) {
+              auto max_size = max_sizes[s];
+              // square prior with size sqrt(minSize * maxSize)
+              box_width = box_height = sqrt(min_size * max_size) / 2.;
+              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+              idx++;
+            }
           }
         }
       }
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a1643d5b35c067ba9064286bab32019fb34fbe8
--- /dev/null
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class RpnTargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
+                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("LocationIndex"),
+        "Output(LocationIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ScoreIndex"),
+        "Output(ScoreIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetLabel"),
+        "Output(TargetLabel) of RpnTargetAssignOp should not be null");
+
+    auto in_dims = ctx->GetInputDim("DistMat");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "The rank of Input(DistMat) must be 2.");
+  }
+};
+
+template <typename T>
+class RpnTargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
+                   const int row, const int col, const float pos_threshold,
+                   const float neg_threshold, int64_t* target_label_data,
+                   std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
+    int fg_offset = fg_inds->size();
+    int bg_offset = bg_inds->size();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* v = dist_data + i * col;
+      T max_dist = *std::max_element(v, v + col);
+      for (int64_t j = 0; j < col; ++j) {
+        T val = dist_data[i * col + j];
+        if (val == max_dist) target_label_data[j] = 1;
+      }
+    }
+
+    // Pick the fg/bg and count the number
+    for (int64_t j = 0; j < col; ++j) {
+      if (anchor_to_gt_max.data<T>()[j] > pos_threshold) {
+        target_label_data[j] = 1;
+      } else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) {
+        target_label_data[j] = 0;
+      }
+      if (target_label_data[j] == 1) {
+        fg_inds->push_back(fg_offset + j);
+      } else if (target_label_data[j] == 0) {
+        bg_inds->push_back(bg_offset + j);
+      }
+    }
+  }
+
+  void ReservoirSampling(const int num, const int offset,
+                         std::minstd_rand engine,
+                         std::vector<int>* inds) const {
+    std::uniform_real_distribution<float> uniform(0, 1);
+    const int64_t size = static_cast<int64_t>(inds->size());
+    if (size > num) {
+      for (int64_t i = num; i < size; ++i) {
+        int rng_ind = std::floor(uniform(engine) * i);
+        if (rng_ind < num)
+          std::iter_swap(inds->begin() + rng_ind + offset,
+                         inds->begin() + i + offset);
+      }
+    }
+  }
+
+  void RpnTargetAssign(const framework::ExecutionContext& ctx,
+                       const Tensor& dist, const float pos_threshold,
+                       const float neg_threshold, const int rpn_batch_size,
+                       const int fg_num, std::minstd_rand engine,
+                       std::vector<int>* fg_inds, std::vector<int>* bg_inds,
+                       int64_t* target_label_data) const {
+    auto* dist_data = dist.data<T>();
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    int fg_offset = fg_inds->size();
+    int bg_offset = bg_inds->size();
+
+    // Calculate the max IoU between anchors and gt boxes
+    Tensor anchor_to_gt_max;
+    anchor_to_gt_max.mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(col), 1}),
+        platform::CPUPlace());
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    auto x = EigenMatrix<T>::From(dist);
+    auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
+    x_col_max.device(place) =
+        x.maximum(Eigen::DSizes<int, 1>(0))
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1));
+    // Follow the Faster RCNN's implementation
+    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
+                neg_threshold, target_label_data, fg_inds, bg_inds);
+    // Reservoir Sampling
+    ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
+    int bg_num = rpn_batch_size - fg_inds->size();
+    ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dist = context.Input<LoDTensor>("DistMat");
+    auto* loc_index = context.Output<Tensor>("LocationIndex");
+    auto* score_index = context.Output<Tensor>("ScoreIndex");
+    auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
+
+    auto col = dist->dims()[1];
+    int64_t n = dist->lod().size() == 0UL
+                    ? 1
+                    : static_cast<int64_t>(dist->lod().back().size() - 1);
+    if (dist->lod().size()) {
+      PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
+    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
+    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
+    float fg_fraction = context.Attr<float>("fg_fraction");
+
+    int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
+
+    int64_t* target_label_data =
+        tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
+
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
+    iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
+
+    std::vector<int> fg_inds;
+    std::vector<int> bg_inds;
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed =
+        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    engine.seed(seed);
+
+    if (n == 1) {
+      RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
+                      rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                      target_label_data);
+    } else {
+      auto lod = dist->lod().back();
+      for (size_t i = 0; i < lod.size() - 1; ++i) {
+        Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
+        RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
+                        rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                        target_label_data + i * col);
+      }
+    }
+    int* loc_index_data = loc_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size())}, context.GetPlace());
+    int* score_index_data = score_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size() + bg_inds.size())},
+        context.GetPlace());
+    memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
+           fg_inds.size() * sizeof(int));
+    memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
+           fg_inds.size() * sizeof(int));
+    memcpy(score_index_data + fg_inds.size(),
+           reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
+  }
+};
+
+class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "DistMat",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[K, M]. It is pair-wise distance matrix between the entities "
+        "represented by each row and each column. For example, assumed one "
+        "entity is A with shape [K], another entity is B with shape [M]. The "
+        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
+        "the distance is, the better macthing the pairs are. Please note, "
+        "This tensor can contain LoD information to represent a batch of "
+        "inputs. One instance of this batch can contain different numbers of "
+        "entities.");
+    AddAttr<float>(
+        "rpn_positive_overlap",
+        "Minimum overlap required between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a positive example.")
+        .SetDefault(0.7);
+    AddAttr<float>(
+        "rpn_negative_overlap",
+        "Maximum overlap allowed between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a negative examples.")
+        .SetDefault(0.3);
+    AddAttr<float>(
+        "fg_fraction",
+        "Target fraction of RoI minibatch that "
+        "is labeled foreground (i.e. class > 0), 0-th class is background.")
+        .SetDefault(0.25);
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest.")
+        .SetDefault(false);
+    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddOutput(
+        "LocationIndex",
+        "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
+        "shape of the LocationIndex is [F], F depends on the value of input "
+        "tensor and attributes.");
+    AddOutput(
+        "ScoreIndex",
+        "(Tensor), The indexes of foreground and background anchors in all "
+        "RPN anchors(The rest anchors are ignored). The shape of the "
+        "ScoreIndex is [F + B], F and B depend on the value of input "
+        "tensor and attributes.");
+    AddOutput("TargetLabel",
+              "(Tensor<int64_t>), The target labels of each anchor with shape "
+              "[K * M, 1], "
+              "K and M is the same as they are in DistMat.");
+    AddComment(R"DOC(
+This operator can be, for given the IoU between the ground truth bboxes and the
+anchors, to assign classification and regression targets to each prediction.
+The Score index and LocationIndex will be generated according to the DistMat.
+The rest anchors would not contibute to the RPN training loss
+
+ScoreIndex is composed of foreground anchor indexes(positive labels) and
+background anchor indexes(negative labels). LocationIndex is exactly same
+as the foreground anchor indexes since we can not assign regression target to 
+the background anchors.
+
+The classification targets(TargetLabel) is a binary class label (of being
+an object or not). Following the paper of Faster-RCNN, the positive labels
+are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that
+a single ground-truth box may assign positive labels to multiple anchors.
+A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap
+(0.3) for all ground-truth boxes. Anchors that are neither positive nor
+negative do not contribute to the training objective.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp,
+                  ops::RpnTargetAssignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel<float>,
+                       ops::RpnTargetAssignKernel<double>);
diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
index 3d529737414d54f05e8c82ede1d6068e5d261110..7f989dfca699d498432f8df3f86c44723faeb980 100644
--- a/paddle/fluid/operators/detection/target_assign_op.h
+++ b/paddle/fluid/operators/detection/target_assign_op.h
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     int64_t k = x->dims()[2];
 
     auto x_lod = x->lod().back();
+#if defined(PADDLE_WITH_CUDA)
     size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
+#else
+    size_t* x_lod_data = x_lod.data();
+#endif
 
     TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
                                        mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
+#if defined(PADDLE_WITH_CUDA)
       size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+#else
+      size_t* neg_lod_data = neg_lod.data();
+#endif
       NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
       neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
                       mismatch_value, out_data, out_wt_data);
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 675ca36774beb72cc1e9b136ad0b18ce061689ac..da5d20505e9b06c0717af8d79d5456a9ade1e89c 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -1,33 +1,43 @@
+if(NOT WITH_DISTRIBUTE)
+    return()
+endif()
+
+if(WITH_GRPC)
+    set(cc_generic_services "false")
+else()
+    set(cc_generic_services "true")
+endif()
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
+
 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
-      selected_rows memory)
+  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+      PROTO send_recv.proto 
+      DEPS lod_tensor selected_rows memory)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-          cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
-          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
-          proto_desc lookup_table_op SERIAL)
+  cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
+    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
   return()
 endif()
 
 
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc  rpc_client.cc request_handler_impl.cc
+
+set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
+    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
   PROTO send_recv.proto
   DEPS lod_tensor selected_rows memory)
 
-find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
-ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
-
+set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
 
-find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
-ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
+cc_test(brpc_server_test SRCS rpc_server_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)
 
-cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc 
-       brpc protobuf leveldb gflags glog
-       protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
+cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
similarity index 96%
rename from paddle/fluid/operators/distributed/bytebuffer_stream.cc
rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
index 6e91b447db838c9095432eda22e9e1171e938d31..d192f54ee0c924b772045d9b6a01701f640e07c7 100644
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.
 
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
similarity index 87%
rename from paddle/fluid/operators/distributed/bytebuffer_stream.h
rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
index e7de172c79c30761483b5d96f5bad19860208832..e9074574cdd163bbf7e62939df9283352706f840 100644
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "grpc++/grpc++.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 
 namespace grpc {
 // A ZeroCopyInputStream that reads from grpc_byte_buffer
@@ -107,25 +108,6 @@ class GrpcBufferReader final
 namespace paddle {
 namespace operators {
 namespace distributed {
-// Source provides a way for a particular RPC implementation to provide
-// received data to ParseFrom.
-class Source {
- public:
-  virtual ~Source() {}
-
-  // Return the stream that contains the data to be parsed.
-  // Note that this method might be invoked more than once if
-  // ParseFrom needs to fall back to a more expensive parsing method.
-  // Every call must return a stream pointing at the beginning of
-  // the serialized RecvTensorResponse.
-  //
-  // Note that a subsequent call to contents() invalidates previous
-  // results of contents().
-  //
-  // Ownership of the returned stream is retained by the Source and
-  // should not be deleted by the caller.
-  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
-};
 
 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558..b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -35,23 +36,20 @@ void GRPCClient::InitEventLoop() {
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
-void GRPCClient::SendBeginPass() {
-  for (auto& it : channels_) {
-    VLOG(3) << "send begin pass to: " << it.first;
-    this->AsyncSendBeginPass(it.first);
-  }
-  this->Wait();
-}
-
-void GRPCClient::SendEndPass() {
-  for (auto& it : channels_) {
-    VLOG(3) << "send end pass to " << it.first;
-    this->AsyncSendEndPass(it.first);
+void GRPCClient::SendComplete() {
+  std::unique_lock<std::mutex> lk(completed_mutex_);
+  if (!completed_) {
+    for (auto& it : channels_) {
+      VLOG(3) << "send complete message to " << it.first;
+      this->AsyncSendComplete(it.first);
+    }
+    PADDLE_ENFORCE(this->Wait(), "internal grpc error");
+    completed_ = true;
   }
-  this->Wait();
 }
 
 GRPCClient::~GRPCClient() {
+  stopped_ = true;
   Wait();
   cq_.Shutdown();
   {
@@ -59,7 +57,9 @@ GRPCClient::~GRPCClient() {
     for (auto& it : channels_) {
       it.second.reset();
     }
+    channels_.clear();
   }
+
   client_thread_->join();
 }
 
@@ -236,32 +236,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   req_count_++;
 }
 
-void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
   s->Prepare(time_out);
 
   sendrecv::VariableMessage req;
-  req.set_varname(BEGIN_PASS_MESSAGE);
+  req.set_varname(COMPLETE_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
 }
 
-void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  s->Prepare(time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(END_PASS_MESSAGE);
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
-  req_count_++;
-}
-
 void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                        const std::string& dir,
                                        int64_t time_out) {
@@ -279,22 +266,31 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   req_count_++;
 }
 
-void GRPCClient::Wait() {
+bool GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
-  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+  sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
+  return ok_;
 }
 
 void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
-  while (cq_.Next(&tag, &ok)) {
+  while (!stopped_ && cq_.Next(&tag, &ok)) {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
     if (c->status_.ok()) {
       VLOG(3) << c->var_h_.String() << " process";
       c->Process();
+    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
+      LOG(ERROR) << c->var_h_.String()
+                 << " meets grpc error:" << c->status_.error_message();
+      {
+        std::lock_guard<std::mutex> lk(sync_mutex_);
+        ok_ = false;
+      }
+      sync_cond_.notify_all();
     } else {
       LOG(FATAL) << c->var_h_.String()
                  << " meets grpc error:" << c->status_.error_message();
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 5dae20155edcf9edd746a5d9a9bbe0ccd789f431..0c95ffeb5ce7e1586c5968fb122acd12c0c0196e 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -38,7 +38,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
@@ -46,23 +49,6 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-struct VarHandle {
-  // RPC endpoint.
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  // Variable name.
-  std::string name;
-  // RPC method name.
-  std::string method;
-
-  std::string String() const {
-    std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
-    return s.str();
-  }
-};
-
 void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
 class BaseProcessor {
@@ -188,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
 
 class GRPCClient : public RPCClient {
  public:
-  GRPCClient() {}
+  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
   virtual ~GRPCClient();
 
   bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
@@ -215,17 +201,12 @@ class GRPCClient : public RPCClient {
   void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
                              int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void AsyncSendBeginPass(const std::string& ep,
-                          int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncSendEndPass(const std::string& ep,
-                        int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void Wait() override;
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = FLAGS_rpc_deadline) override;
 
-  void SendBeginPass() override;
+  bool Wait() override;
 
-  void SendEndPass() override;
+  void SendComplete() override;
 
  protected:
   void InitImpl() override;
@@ -247,10 +228,17 @@ class GRPCClient : public RPCClient {
   std::mutex sync_mutex_;
   std::condition_variable sync_cond_;
   std::atomic<int64_t> req_count_{0};
+  bool ok_;
 
   // mutex for GetChannel thread safety
   std::mutex chan_mutex_;
   DISABLE_COPY_AND_ASSIGN(GRPCClient);
+
+  // mutex for sending complete message only once
+  std::mutex completed_mutex_;
+  bool completed_;
+
+  volatile bool stopped_;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f8796713a6b89a308113981614673e07e8d367f
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include <sys/time.h>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
+  // Default DestroyCallback does nothing, When using GPU
+  // the CPU buffer need to be freed.
+  DestroyCallback destroy_callback = [](void* backing) {};
+  VarMsg request;
+  void* payload = nullptr;
+  size_t payload_size;
+
+  request.set_varname(name);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_name.empty()) {
+    request.set_out_varname(out_name);
+  }
+  if (var->IsType<framework::LoDTensor>()) {
+    request.set_type(::sendrecv::LOD_TENSOR);
+    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
+  } else if (var->IsType<framework::SelectedRows>()) {
+    request.set_type(::sendrecv::SELECTED_ROWS);
+    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request.set_type(::sendrecv::NCCL_ID);
+#endif
+  } else {
+    PADDLE_THROW("Serialize does not support type: %s",
+                 typeid(var->Type()).name());
+  }
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    // GPU data is copied to CPU buffer when sending,
+    // free the buffer when possible.
+    destroy_callback = [](void* backing) {
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
+    };
+#endif
+  }
+
+  std::string header;
+  request.AppendToString(&header);
+  auto buffer = std::unique_ptr<char[]>(new char[1024]);
+  void* buf = buffer.get();
+  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  e.WriteRawBytes(std::string(header.data(), header.size()));
+// NCCLID is copied directly to the message, return bytebuffer
+// with only one slice if serializing NCCLID.
+#ifdef PADDLE_WITH_CUDA
+  if (var->IsType<ncclUniqueId>()) {
+    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                              NCCL_UNIQUE_ID_BYTES);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
+
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
+  }
+#endif
+
+  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
+  // steal reference of tensor data
+  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
+  int num_slices = 2;       // only SelectedRows have rows buffer
+  slices[0] = ::grpc::Slice(e.size());
+  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
+  slices[1] = ::grpc::Slice(
+      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
+                                    static_cast<char*>(payload)),
+      ::grpc::Slice::STEAL_REF);
+
+  if (var->IsType<framework::SelectedRows>()) {
+    auto* slr = var->GetMutable<framework::SelectedRows>();
+    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
+    size_t rows_memory_size =
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
+    slices[2] = ::grpc::Slice(e2.size());
+    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
+
+    slices[3] = ::grpc::Slice(
+        grpc_slice_new_with_user_data(
+            const_cast<void*>(
+                reinterpret_cast<const void*>(slr->rows().data())),
+            rows_memory_size, [](void* backing) {},
+            const_cast<char*>(
+                reinterpret_cast<const char*>(slr->rows().data()))),
+        ::grpc::Slice::STEAL_REF);
+    num_slices = 4;
+  }
+
+  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
+  msg->Swap(&tmp);
+}
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var) {
+  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
+  *var = resp.GetVar();
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h
new file mode 100644
index 0000000000000000000000000000000000000000..450c41dcd6b1bf9a33d3bbef3a1c94a2f83ff322
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sys/time.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+typedef void (*DestroyCallback)(void*);
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = std::string());
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
index 3d107b533bcb7bfef3f9b13ec99afbd579a62e52..96ea05e74ed76768248a27ab435dc801b7d1b995 100644
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -21,8 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
   framework::Scope scope;
   scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
   EXPECT_EQ(resp.Parse(msg), 0);
 
   framework::Variable* var2 = resp.GetVar();
@@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   // deserialize zero-copy
   framework::Scope scope;
   scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
   if (from_type == 0) {
     EXPECT_EQ(resp.Parse(msg), 0);
   } else {
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41..8edb00276df3ade1b320fbf2873e8b54ff3e1464 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <limits>
 #include <string>
 
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/grpc_server.h"
 
 using ::grpc::ServerAsyncResponseWriter;
@@ -84,9 +85,9 @@ class RequestSend final : public RequestBase {
                        ::grpc::ServerCompletionQueue* cq,
                        RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(),
-                                        !request_handler->sync_mode()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(),
+                                            !request_handler->sync_mode()));
     int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
     service_->RequestAsyncUnary(
         method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
@@ -109,7 +110,7 @@ class RequestSend final : public RequestBase {
 
  protected:
   sendrecv::VoidMessage reply_;
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
@@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase {
       : RequestBase(service, cq, request_handler, req_id),
         responder_(&ctx_),
         local_scope_(nullptr) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(), true));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(), true));
     int method_id =
         static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
     service_->RequestAsyncUnary(
@@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase {
   }
 
  protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
   framework::Scope* local_scope_;
@@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public RequestBase {
                                    ::grpc::ServerCompletionQueue* cq,
                                    RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx()));
     int method_id =
         static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
     service_->RequestAsyncUnary(
@@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase {
   }
 
  protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   sendrecv::VoidMessage reply_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
index cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0..9ae9a31a003cbb1f808fd1127a5dd78511aa3e99 100644
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -23,8 +23,7 @@
 #include <grpc++/impl/codegen/stub_options.h>
 #include <grpc++/impl/codegen/sync_stream.h>
 #include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/platform/profiler.h"
 
 // NOTE: This method was originally created by tensorflow
@@ -42,17 +41,18 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::VariableResponse.
 // Wire-format is identical to RecvVariableResponse.
 template <>
-class SerializationTraits<paddle::operators::distributed::VariableResponse> {
+class SerializationTraits<
+    paddle::operators::distributed::GRPCVariableResponse> {
  public:
   static Status Serialize(
-      const paddle::operators::distributed::VariableResponse& msg,
+      const paddle::operators::distributed::GRPCVariableResponse& msg,
       grpc_byte_buffer** bp, bool* own_buffer) {
     PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
     return Status();
   }
   static Status Deserialize(
       grpc_byte_buffer* buffer,
-      paddle::operators::distributed::VariableResponse* msg,
+      paddle::operators::distributed::GRPCVariableResponse* msg,
       int max_message_size = INT_MAX) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34d47f3ec0f3025109447b66078b724607d2953a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
@@ -0,0 +1,308 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+enum WireType {
+  WIRETYPE_VARINT = 0,
+  WIRETYPE_LENGTH_DELIMITED = 2,
+};
+
+inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
+
+inline WireType GetTagWireType(uint32_t tag) {
+  return static_cast<WireType>(tag & 0x7);
+}
+
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t v;
+  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
+    *result = static_cast<int>(v);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource source;
+  source.Init(byte_buffer);
+  GrpcByteBufferSourceWrapper r(&source);
+
+  return Parse(&r);
+}
+
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+
+    if (!p.second) {
+      return (tag == 0);
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        uint64_t v;
+        if (wt == WIRETYPE_VARINT) {
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input->CurrentPosition();
+          while (input->CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input->ReadVarint64(&v)) {
+              return tag;
+            }
+            lod->push_back(v);
+          }
+          break;
+        }
+
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+
+  return true;
+}
+
+int GRPCVariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      if (tag != 0) {
+        return -1;
+      }
+      return 0;
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint32_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint32_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input.CurrentPosition();
+          while (input.CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
+            input.IncrementRecursionDepthAndPushLimit(length);
+
+        std::vector<int64_t> lod_data;
+        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+
+        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
+          return tag;
+        }
+
+        if (lod_data.size() == 0) {
+          break;
+        }
+
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!ProcSerializedField(tag, &input, num_bytes)) {
+          return tag;
+        }
+
+        break;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return tag;
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_out_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        uint64_t profiling = 0;
+        if (!input.ReadVarint64(&profiling)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
+          break;
+        }
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+        }
+        break;
+      }
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc_variable_response.h
new file mode 100644
index 0000000000000000000000000000000000000000..89df07c92cd33bcb76c8539b5566d74fa21bba5e
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.h
@@ -0,0 +1,58 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class GRPCVariableResponse : public VariableResponse {
+ public:
+  GRPCVariableResponse(const framework::Scope* scope,
+                       const platform::DeviceContext* dev_ctx,
+                       bool create_scope = false)
+      : VariableResponse(scope, dev_ctx, create_scope) {}
+
+  virtual ~GRPCVariableResponse() {}
+
+  int Parse(Source* source) override;
+
+  // return:
+  // 0:ok.
+  // -1: unkown error.
+  // other: number of error field.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 271306d5d20f1b849a81a9bfa6436f2faf261204..64ac7281848f91302bc0aa3cb81dd198e56fb653 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -43,14 +43,29 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 #define COMPLETE_MESSAGE "COMPLETE@RECV"
-#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
-#define END_PASS_MESSAGE "END_PASS@RECV"
 
 #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
 #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
 
 class RPCServer;
 
+struct VarHandle {
+  // RPC endpoint.
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  // Variable name.
+  std::string name;
+  // RPC method name.
+  std::string method;
+
+  std::string String() const {
+    std::ostringstream s;
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    return s.str();
+  }
+};
+
 class RequestHandler {
  public:
   explicit RequestHandler(bool sync_mode)
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6..55995783c6eab10632ab2a5bca64ca856f000df1 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -53,20 +53,18 @@ bool RequestSendHandler::Handle(const std::string& varname,
 
   // Sync
   if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv batch barrier message";
+    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
     rpc_server_->IncreaseBatchBarrier(kRequestSend);
-  } else if (varname == BEGIN_PASS_MESSAGE) {
-    VLOG(3) << "sync: recv begin pass message";
-    rpc_server_->WaitCond(kRequestSend);
-    rpc_server_->BeginPass();
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->Complete();
   } else {
     VLOG(3) << "sync: received var_name: " << varname;
     rpc_server_->WaitCond(kRequestSend);
     VLOG(3) << "sync: processing received var: " << varname;
 
     if (invar == nullptr) {
-      LOG(ERROR) << "sync: Can not find server side var: " << varname;
-      PADDLE_THROW("sync: Can not find server side var");
+      LOG(FATAL) << "sync: Can not find server side var: " << varname;
       return false;
     }
     if (invar->IsType<framework::SelectedRows>()) {
@@ -95,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname,
     if (varname == FETCH_BARRIER_MESSAGE) {
       VLOG(3) << "sync: recv fetch barrier message";
       rpc_server_->IncreaseBatchBarrier(kRequestGet);
-    } else if (varname == END_PASS_MESSAGE) {
-      rpc_server_->EndPass();
     } else {
       rpc_server_->WaitCond(kRequestGet);
       *outvar = scope_->FindVar(varname);
     }
   } else {
-    if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) {
+    if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
       *outvar = scope_->FindVar(varname);
     }
   }
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 6479d3a97bafba37b74a1d1c04852a6e60e01be8..22a022a5d25e5c6628b80294494b87ca105a04c7 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -60,19 +60,15 @@ class RPCClient {
                                      const std::string& dir,
                                      int64_t time_out = FLAGS_rpc_deadline) = 0;
 
-  virtual void AsyncSendBeginPass(const std::string& ep,
-                                  int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual void AsyncSendComplete(const std::string& ep,
+                                 int64_t time_out = FLAGS_rpc_deadline) = 0;
 
-  virtual void AsyncSendEndPass(const std::string& ep,
-                                int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  // BeginePass/EndPass tells all the pserver that start/end a pass, so that
-  // the pserver can increase/reduce it's barrier count, and continue to train
+  // Complete tells all the pserver instances that finishe the training,
+  // the pserver can reduce it's barrier count, and continue to train
   // with other trainers.
-  virtual void SendBeginPass() = 0;
-  virtual void SendEndPass() = 0;
+  virtual void SendComplete() = 0;
 
-  virtual void Wait() = 0;
+  virtual bool Wait() = 0;
 
   template <typename T>
   static RPCClient* GetInstance() {
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a..83b14fa64d735d80f43bf55c798cddb2f3ea7032 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
   }
 }
 
-void RPCServer::BeginPass() {
-  VLOG(4) << "RPCServer begin increase pass barrier";
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    client_num_++;
-    VLOG(4) << "increase client_num to: " << client_num_;
-  }
-  barrier_cond_.notify_all();
-}
-
-void RPCServer::EndPass() {
-  VLOG(4) << "RPCServer begin increase pass barrier";
+void RPCServer::Complete() {
   {
     std::unique_lock<std::mutex> lock(mutex_);
     client_num_--;
@@ -87,6 +76,11 @@ void RPCServer::EndPass() {
   barrier_cond_.notify_all();
 }
 
+int RPCServer::GetClientNum() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  return client_num_;
+}
+
 void RPCServer::ResetBarrierCounter() {
   VLOG(3) << "RPCServer ResetBarrierCounter ";
   std::unique_lock<std::mutex> lock(mutex_);
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index 833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf..fd914d7a72e61bc9472876c433b65598ef5b1980 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -44,7 +44,7 @@ class RPCServer {
 
   int GetSelectedPort() const { return selected_port_; }
 
-  int GetClientNum() const;
+  int GetClientNum();
 
   void SavePort() const;
 
@@ -64,8 +64,7 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
 
-  void BeginPass();
-  void EndPass();
+  void Complete();
 
   void ResetBarrierCounter();
 
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index a0693cffabcc561b0adfafc2c49027a890dd5efc..b50830c362d3f6ecf38affbfa6a1ffe2ed77e125 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -30,7 +30,7 @@ namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace distributed = paddle::operators::distributed;
 
-USE_OP(lookup_table);
+USE_NO_KERNEL_OP(lookup_sparse_table);
 
 std::unique_ptr<distributed::RPCServer> g_rpc_service;
 std::unique_ptr<distributed::RequestHandler> g_req_handler;
@@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
   framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
   framework::VariableNameMap output({{"Output", {"out"}}});
   auto op = block->AppendOp();
-  op->SetType("lookup_table");
+  op->SetType("lookup_sparse_table");
   op->SetInput("W", {"w"});
   op->SetInput("Ids", {"ids"});
   op->SetOutput("Out", {"out"});
 
   auto& out = *root_block->Var("out");
-  out.SetType(framework::proto::VarType::SELECTED_ROWS);
+  out.SetType(framework::proto::VarType::LOD_TENSOR);
   out.SetShape({10, 10});
 
   return block;
@@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
   w_var->GetMutable<framework::SelectedRows>();
 
   auto out_var = scope->Var("out");
-  out_var->GetMutable<framework::SelectedRows>();
+  out_var->GetMutable<framework::LoDTensor>();
 
   auto ids_var = scope->Var("ids");
-  ids_var->GetMutable<framework::SelectedRows>();
+  ids_var->GetMutable<framework::LoDTensor>();
 }
 
 void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
                          int64_t rows_numel) {
   CreateVarsOnScope(scope, place);
-  auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
-  auto rows = ids_var->mutable_rows();
-  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
-  ids_var->mutable_value()->Resize({rows_numel, 1});
-  ids_var->mutable_value()->mutable_data<float>(*place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
+  int64_t* ids_ptr =
+      ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
 }
 
 void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
@@ -91,7 +90,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
   }
 }
 
-void StartServer() {
+void StartServer(const std::string& rpc_name) {
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CPUPlace place;
@@ -107,14 +106,14 @@ void StartServer() {
                      std::shared_ptr<framework::ExecutorPrepareContext>>
       prefetch_var_name_to_prepared;
   prefetch_var_name_to_prepared[in_var_name] = prepared[0];
+
   g_req_handler->SetProgram(&program);
   g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
   g_req_handler->SetDevCtx(&ctx);
   g_req_handler->SetScope(&scope);
   g_req_handler->SetExecutor(&exe);
 
-  g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
-                             g_req_handler.get());
+  g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
   g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(
@@ -129,7 +128,7 @@ TEST(PREFETCH, CPU) {
   distributed::RPCClient* client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
-  std::thread server_thread(StartServer);
+  std::thread server_thread(StartServer, distributed::kRequestPrefetch);
   g_rpc_service->WaitServerReady();
 
   int port = g_rpc_service->GetSelectedPort();
@@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) {
     client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
     client->Wait();
     auto var = scope.Var(out_var_name);
-    auto value = var->GetMutable<framework::SelectedRows>()->value();
-    auto ptr = value.mutable_data<float>(place);
+    auto value = var->GetMutable<framework::LoDTensor>();
+    auto ptr = value->mutable_data<float>(place);
 
     for (int64_t i = 0; i < rows_numel; ++i) {
-      EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+      EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast<float>(i * 2));
     }
   }
 
@@ -162,3 +161,24 @@ TEST(PREFETCH, CPU) {
   g_rpc_service.reset(nullptr);
   g_req_handler.reset(nullptr);
 }
+
+TEST(COMPLETE, CPU) {
+  g_req_handler.reset(new distributed::RequestSendHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+  PADDLE_ENFORCE(client != nullptr);
+  std::thread server_thread(StartServer, distributed::kRequestSend);
+  g_rpc_service->WaitServerReady();
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+  client->AsyncSendComplete(ep);
+  client->Wait();
+
+  EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
+
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto.in
similarity index 97%
rename from paddle/fluid/operators/distributed/send_recv.proto
rename to paddle/fluid/operators/distributed/send_recv.proto.in
index e0902320cff003797b12ed0204f7f99c44554b62..8b0a09abe1d05dda10eda0030eb91cb9ca40683e 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -1,3 +1,4 @@
+
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
 the Apache License, Version 2.0 (the "License"); you may not use this file
 except in compliance with the License.
@@ -14,7 +15,7 @@ limitations under the License. */
 syntax = "proto3";
 package sendrecv;
 
-// option cc_generic_services = true;
+option cc_generic_services = @cc_generic_services@;
 
 service SendRecvService {
   // For parameter server round-robin like hashing, do not split tensors.
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 98129d9f1014c39347e3409533f2bc10092611d2..6a3f8fd544bc5d669b725765a863b42ec069a7b6 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
 #include <sys/time.h>
 #include <thread>  // NOLINT
 
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -34,6 +28,13 @@ namespace distributed {
 
 using VarMsg = sendrecv::VariableMessage;
 
+#ifdef PADDLE_WITH_CUDA
+void* GetVarPayLoad(const std::string varname, int64_t size) {
+  platform::CUDAPinnedPlace cuda_pinned;
+  return memory::Alloc(cuda_pinned, size);
+}
+#endif
+
 void GetTensorPayload(framework::Variable* var,
                       const platform::DeviceContext& ctx, VarMsg* request,
                       void** payload, size_t* payload_size) {
@@ -58,15 +59,17 @@ void GetTensorPayload(framework::Variable* var,
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    platform::CUDAPinnedPlace cuda_pinned;
+    // platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);
 
+    platform::CUDAPinnedPlace cuda_pinned;
     memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor.place()),
                  reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
                  gpu_dev_ctx.stream());
+
     ctx.Wait();
 #endif
   } else {
@@ -91,10 +94,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
   auto* tensor = slr->mutable_value();
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);
+
+    platform::CUDAPinnedPlace cuda_pinned;
     memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor->place()),
                  reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
@@ -107,126 +111,6 @@ void GetSelectedRowsPayload(framework::Variable* var,
   *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
 }
 
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_name) {
-  // Default DestroyCallback does nothing, When using GPU
-  // the CPU buffer need to be freed.
-  DestroyCallback destroy_callback = [](void* backing) {};
-  VarMsg request;
-  void* payload = nullptr;
-  size_t payload_size;
-
-  request.set_varname(name);
-  // Note: normally the profiler is enabled in 1 trainer, hence only
-  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
-  // servers the trainer's profiling state so that PS can follow the
-  // trainer.
-  if (platform::ShouldSendProfileState()) {
-    if (platform::IsProfileEnabled()) {
-      request.set_profile(platform::kEnableProfiler);
-    } else {
-      request.set_profile(platform::kDisableProfiler);
-    }
-  }
-  if (!out_name.empty()) {
-    request.set_out_varname(out_name);
-  }
-  if (var->IsType<framework::LoDTensor>()) {
-    request.set_type(::sendrecv::LOD_TENSOR);
-    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
-  } else if (var->IsType<framework::SelectedRows>()) {
-    request.set_type(::sendrecv::SELECTED_ROWS);
-    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
-#ifdef PADDLE_WITH_CUDA
-  } else if (var->IsType<ncclUniqueId>()) {
-    request.set_type(::sendrecv::NCCL_ID);
-#endif
-  } else {
-    PADDLE_THROW("Serialize does not support type: %s",
-                 typeid(var->Type()).name());
-  }
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    // GPU data is copied to CPU buffer when sending,
-    // free the buffer when possible.
-    destroy_callback = [](void* backing) {
-      platform::CUDAPinnedPlace cuda_pinned;
-      memory::Free(cuda_pinned, backing);
-    };
-#endif
-  }
-
-  std::string header;
-  request.AppendToString(&header);
-  auto buffer = std::unique_ptr<char[]>(new char[1024]);
-  void* buf = buffer.get();
-  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
-  e.WriteRawBytes(std::string(header.data(), header.size()));
-// NCCLID is copied directly to the message, return bytebuffer
-// with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_CUDA
-  if (var->IsType<ncclUniqueId>()) {
-    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
-                              NCCL_UNIQUE_ID_BYTES);
-    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
-    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
-
-    // for serialize NCCL_ID
-    ::grpc::Slice slices(e.size());
-    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
-    ::grpc::ByteBuffer tmp(&slices, 1);
-    msg->Swap(&tmp);
-    return;
-  }
-#endif
-
-  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-  // steal reference of tensor data
-  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
-  int num_slices = 2;       // only SelectedRows have rows buffer
-  slices[0] = ::grpc::Slice(e.size());
-  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
-  slices[1] = ::grpc::Slice(
-      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
-                                    static_cast<char*>(payload)),
-      ::grpc::Slice::STEAL_REF);
-
-  if (var->IsType<framework::SelectedRows>()) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
-    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
-    slices[2] = ::grpc::Slice(e2.size());
-    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
-
-    slices[3] = ::grpc::Slice(
-        grpc_slice_new_with_user_data(
-            const_cast<void*>(
-                reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size, [](void* backing) {},
-            const_cast<char*>(
-                reinterpret_cast<const char*>(slr->rows().data()))),
-        ::grpc::Slice::STEAL_REF);
-    num_slices = 4;
-  }
-
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var) {
-  operators::distributed::VariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-  *var = resp.GetVar();
-}
-
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index fe25e73fa608727ba0bb912a82776b330ec8d83a..4d08d3c77afa3c1f2b4d7602f7199558bb5a79c0 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -25,24 +25,21 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-typedef void (*DestroyCallback)(void*);
+using VarMsg = sendrecv::VariableMessage;
 
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string());
+void GetTensorPayload(framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      void** payload, size_t* payload_size);
 
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var);
+void GetSelectedRowsPayload(framework::Variable* var,
+                            const platform::DeviceContext& ctx, VarMsg* request,
+                            void** payload, size_t* payload_size);
 
 inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
   switch (type) {
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 45832c60bf9172497afabac927ba39a7cbfb9a52..466bce18af7cf97014a7b1ba64df68eab193c7c8 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,50 +13,20 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/variable_response.h"
-
-#include <string>
-#include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include "paddle/fluid/platform/profiler.h"
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-enum WireType {
-  WIRETYPE_VARINT = 0,
-  WIRETYPE_LENGTH_DELIMITED = 2,
-};
-
-inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
-
-inline WireType GetTagWireType(uint32_t tag) {
-  return static_cast<WireType>(tag & 0x7);
-}
-
-bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
-                         int* result) {
-  uint64_t v;
-  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
-    *result = static_cast<int>(v);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
-             const platform::DeviceContext& dev_ctx, platform::Place place,
-             void* dest, int size) {
+bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
+                               const platform::DeviceContext& dev_ctx,
+                               platform::Place place, void* dest,
+                               int64_t size) {
   const void* data = NULL;
   int size_to_write = 0;
-  int length = size;
+  int64_t length = size;
   int total_written = 0;
 
   if (platform::is_gpu_place(place)) {
@@ -194,294 +164,49 @@ bool VariableResponse::CopySelectRowsData(
   return true;
 }
 
-bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
-                  std::vector<int64_t>* lod) {
-  while (true) {
-    auto p = input->ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-
-    if (!p.second) {
-      return (tag == 0);
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
-        uint64_t v;
-        if (wt == WIRETYPE_VARINT) {
-          if (!input->ReadVarint64(&v)) {
-            return false;
-          }
-          lod->push_back(v);
-          break;
-        }
-
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input->CurrentPosition();
-          while (input->CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input->ReadVarint64(&v)) {
-              return tag;
-            }
-            lod->push_back(v);
-          }
-          break;
-        }
+bool VariableResponse::ProcSerializedField(
+    int tag, ::google::protobuf::io::CodedInputStream* input,
+    int64_t num_bytes) {
+  PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                  meta_.type() == sendrecv::LOD_TENSOR ||
+                  meta_.type() == sendrecv::NCCL_ID) &&
+                     meta_.varname() != "",
+                 "meta info should be got first!");
 
+  if (meta_.type() == sendrecv::NCCL_ID) {
+#ifdef PADDLE_WITH_CUDA
+    auto* var = scope_->FindVar(meta_.varname());
+    if (var != nullptr) {
+      ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+      if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal,
+                   num_bytes)) {
         return false;
       }
-      default: { return false; }
     }
-  }
-
-  return true;
-}
-
-int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
-  GrpcByteBufferSource source;
-  source.Init(byte_buffer);
-  GrpcByteBufferSourceWrapper r(&source);
-
-  return Parse(&r);
-}
-
-int VariableResponse::Parse(Source* source) {
-  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
-      source->contents();
-  ::google::protobuf::io::CodedInputStream input(input_stream);
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
-
-  while (true) {
-    auto p = input.ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-    if (!p.second) {
-      if (tag != 0) {
-        return -1;
-      }
-      return 0;
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage::kVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kTypeFieldNumber: {
-        uint32_t v;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_type(static_cast<::sendrecv::VarType>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
-        uint32_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDimsFieldNumber: {
-        // not packed
-        if (wt == WIRETYPE_VARINT) {
-          uint64_t v;
-          if (!input.ReadVarint64(&v)) {
-            return tag;
-          }
-          meta_.add_dims(v);
-          break;
-        }
-
-        // packed
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input.CurrentPosition();
-          while (input.CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input.ReadVarint64(&v)) {
-              return tag;
-            }
-            meta_.add_dims(v);
-          }
-          break;
-        }
-        return tag;
-      }
-      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_lod_level(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kLodFieldNumber: {
-        int length = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
-          return tag;
-        }
-
-        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
-            input.IncrementRecursionDepthAndPushLimit(length);
-
-        std::vector<int64_t> lod_data;
-        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
-          return tag;
-        }
-
-        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
-          return false;
-        }
-
-        if (lod_data.size() == 0) {
-          break;
-        }
-
-        auto lod = meta_.add_lod();
-        for (uint32_t i = 0; i < lod_data.size(); i++) {
-          lod->add_lod_data(lod_data[i]);
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_slr_height(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kSerializedFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR ||
-                        meta_.type() == sendrecv::NCCL_ID) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
-          auto* var = scope_->FindVar(meta_.varname());
-          if (var != nullptr) {
-            ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
-            if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
-                         num_bytes)) {
-              return tag;
-            }
-          }
-          break;
+    return true;
 #else
-          PADDLE_THROW("Not compiled with CUDA!");
+    PADDLE_THROW("Not compiled with CUDA!");
+    return false;
 #endif
-        }
-
-        framework::DDim dims = GetDims(meta_.dims());
-        if (meta_.type() == sendrecv::LOD_TENSOR) {
-          PADDLE_ENFORCE(meta_.lod_size() >= 0,
-                         "lod info should be got first!");
-          if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        if (meta_.type() == sendrecv::SELECTED_ROWS) {
-          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        return tag;
-      }
-      case sendrecv::VariableMessage::kRowsFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
-          return tag;
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
+  }
 
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
+  framework::DDim dims = GetDims(meta_.dims());
+  if (meta_.type() == sendrecv::LOD_TENSOR) {
+    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
+    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
+    }
+    return true;
+  }
 
-        meta_.set_out_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kProfileFieldNumber: {
-        uint64_t profiling = 0;
-        if (!input.ReadVarint64(&profiling)) {
-          return tag;
-        }
-        meta_.set_profile(profiling);
-        int64_t listener_id = platform::ListenerId();
-        if (listener_id <= 0) {
-          break;
-        }
-        if (profiling == platform::kEnableProfiler &&
-            !platform::IsProfileEnabled()) {
-          platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (profiling == platform::kDisableProfiler &&
-                   platform::IsProfileEnabled()) {
-          // TODO(panyx0718): Should we allow to customize file dir.
-          platform::DisableProfiler(
-              platform::EventSortingKey::kDefault,
-              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
-        }
-        break;
-      }
-      default: {
-        // Unknown tag, return unknown error.
-        return -1;
-      }
+  if (meta_.type() == sendrecv::SELECTED_ROWS) {
+    if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
     }
+    return true;
   }
 
-  return 0;
+  return true;
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 1db4a0a522654ff2497b8bd9ee1381b5ab64067a..6aec52ca00f59a42ecca01da8df1680ce4eda432 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -22,18 +22,35 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+// Source provides a way for a particular RPC implementation to provide
+// received data to ParseFrom.
+class Source {
+ public:
+  virtual ~Source() {}
+
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // ParseFrom needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized RecvTensorResponse.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source and
+  // should not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
+
 class VariableResponse {
  public:
   VariableResponse(const framework::Scope* scope,
@@ -51,22 +68,19 @@ class VariableResponse {
     }
   }
 
-  // return:
-  // 0:ok.
-  // -1: unkown error.
-  // other: number of error field.
-  int Parse(Source* source);
+  int Parse(Source* source, const sendrecv::VariableMessage& meta) {
+    meta_ = meta;
+    return Parse(source);
+  }
 
   // return:
   // 0:ok.
   // -1: unkown error.
   // other: number of error field.
-  int Parse(const ::grpc::ByteBuffer& byte_buffer);
-
-  const framework::Scope& GetLocalScope() const { return *local_scope_; }
-
-  framework::Scope* GetMutableLocalScope() const { return local_scope_; }
+  virtual int Parse(Source* source) = 0;
 
+  inline const framework::Scope& GetLocalScope() const { return *local_scope_; }
+  inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
   inline std::string Varname() const { return meta_.varname(); }
   inline std::string OutVarname() const { return meta_.out_varname(); }
 
@@ -78,7 +92,11 @@ class VariableResponse {
     return scope_->FindVar(meta_.varname());
   }
 
- private:
+ protected:
+  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+               const platform::DeviceContext& dev_ctx, platform::Place place,
+               void* dest, int64_t size);
+
   bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
                                 const platform::DeviceContext& ctx,
                                 const framework::DDim& dims, int length);
@@ -90,12 +108,16 @@ class VariableResponse {
                          const platform::DeviceContext& ctx,
                          const framework::DDim& dims, int length);
 
- private:
+  bool ProcSerializedField(int tag,
+                           ::google::protobuf::io::CodedInputStream* input,
+                           int64_t num_bytes);
+
+ protected:
   const framework::Scope* scope_;
   const platform::DeviceContext* dev_ctx_;
   bool create_scope_ = false;
   framework::Scope* local_scope_ = nullptr;
-  // only Skeleton
+
   sendrecv::VariableMessage meta_;
 };
 
diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
index 3f612256840825a75f49944ab97ff957d572a863..c86cd57316078778e5930c9b524b931d523028d7 100644
--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
 
     auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_dims_untrimed = y->dims();
     auto z_dims = z->dims();
 
     // Execute default elementwise_add operator when
     // broadcast operations need to performed.
-    if (x_dims != y_dims) {
+    if (x_dims != y_dims_untrimed) {
       auto sum_func = [](T a, T b) -> T { return a + b; };
 
       TransformFunctor<decltype(sum_func), T,
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
               ctx.template device_context<paddle::platform::CPUDeviceContext>(),
               sum_func);
 
-      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
       PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                      "Axis should be in range [0, x_dims)");
 
-      trim_trailing_singular_dims(&y_dims);
+      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
       axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
       int pre, n, post;
@@ -85,10 +85,10 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
                      "Wrong layout/format set for X tensor");
       PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
                          y->format() != memory::format::format_undef,
-                     "Wrong layout/format set for X tensor");
+                     "Wrong layout/format set for Y tensor");
 
       std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
-      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
       std::vector<int> dst_tz = framework::vectorize2int(z_dims);
 
       std::vector<memory::primitive_desc> srcs_pd;
@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+    // skip out, x, y,
+    // dout length is larger or equal than dx, dy.
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
     auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
       in->set_layout(DataLayout::kMKLDNN);
       in->set_format(out->format());
     };
 
-    if (x->dims() == y->dims()) {
-      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-      if (dx) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dx->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dx, dout);
-      }
-
-      if (dy) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dy->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dy, dout);
+    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
+      if (dx->dims() == dy->dims()) {
+        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+        if (dx) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dx->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dx, dout);
+        }
+
+        if (dy) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dy->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dy, dout);
+        }
       }
     } else {
       // Execute default kernel when broadcast is needed
-      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
-                          IdentityGrad<T>, IdentityGrad<T>>(
+      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
+                                  IdentityGrad<T>, IdentityGrad<T>>(
           ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
           IdentityGrad<T>());
     }
diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
+                              "X");
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index baf04c30b17cb333fc8a6544afd6c479442f835b..5356105e2e551c0528694091608fc7585dce66d2 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
                                   framework::Tensor* dy) {
   int axis = ctx.Attr<int>("axis");
 
-  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-      IdentityGrad<T>());
+  ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
+                              IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
+                                               dx, dy, IdentityGrad<T>(),
+                                               IdentityGrad<T>());
 }
 
 template <typename DeviceContext, typename T>
@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    // skip out, x, y
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
-    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+    if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr &&
+        dy != nullptr && (dx->dims() == dy->dims())) {
       elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
     } else {
       default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
+
 REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index bb88970e42c194d9437609b62435f1a89e2b446b..d8a12e800ad733800c1ec333f15d31d4dcd1a3a5 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.").Reuse("X");
+    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
+    // memory.").AsIntermediate();
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
+    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
+  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
+
+// For Add, Sub op, the X, Out is not needed.
+class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
+ public:
+  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
+  using operators::ElementwiseOpGrad::GetExpectedKernelType;
+  using Tensor = framework::Tensor;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+      ctx->SetOutputDim(x_grad_name, out_dims);
+    }
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(y_grad_name)) {
+      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+      auto y_dims = ctx->GetInputDim("Y");
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
+/*
+*/
+
+#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
+  class kernel_type##GradMaker                                               \
+      : public paddle::framework::SingleGradOpDescMaker {                    \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
+      auto* op = new paddle::framework::OpDesc();                            \
+      op->SetType(#kernel_type "_grad");                                     \
+      op->SetInput("Y", Input("Y"));                                         \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+      op->SetAttrMap(Attrs());                                               \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
+      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
+    }                                                                        \
+  }
+
 #define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
   class __ElemwiseOp##op_type##Maker__                                  \
       : public ::paddle::operators::ElementwiseOpMaker {                \
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
                     ::paddle::operators::ElementwiseOpInferVarType,     \
                     ::paddle::framework::DefaultGradOpDescMaker<true>); \
   REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
+
+#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
+  class __ElemwiseOp##op_type##Maker__                                 \
+      : public ::paddle::operators::ElementwiseOpMaker {               \
+   protected:                                                          \
+    virtual std::string GetName() const { return op_name; }            \
+    virtual std::string GetEquation() const { return equation; }       \
+    virtual void SetReuse() { Reuse(__VA_ARGS__); }                    \
+  };                                                                   \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
+                    __ElemwiseOp##op_type##Maker__,                    \
+                    ::paddle::operators::ElementwiseOpInferVarType,    \
+                    op_type##GradMaker);                               \
+  REGISTER_OPERATOR(op_type##_grad,                                    \
+                    ::paddle::operators::ElementwiseOpExplicitGrad)
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 8b052611f80ddf874ca48c1c58e13346528a834e..eb8272e90c32c3a0be2c0ce1bc679571af876317 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
 #include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
-inline void trim_trailing_singular_dims(framework::DDim* dims) {
+inline framework::DDim trim_trailing_singular_dims(
+    const framework::DDim& dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims->size();
+  auto actual_dims_size = dims.size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if ((*dims)[actual_dims_size - 1] != 1) break;
+    if (dims[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims->size()) {
-    auto actual_dims = framework::vectorize(*dims);
-    actual_dims.resize(actual_dims_size);
-    *dims = framework::make_ddim(actual_dims);
+
+  std::vector<int> trim_dims;
+  trim_dims.resize(actual_dims_size);
+  for (int i = 0; i < actual_dims_size; ++i) {
+    trim_dims[i] = dims[i];
   }
+  framework::DDim actual_dims = framework::make_ddim(trim_dims);
+  return actual_dims;
 }
 
 template <typename T, typename DeviceContext>
@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
 
 #endif
 
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeNoBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  size_t N = static_cast<size_t>(framework::product(x_dim));
+  platform::ForRange<DeviceContext> for_range(
+      ctx.template device_context<DeviceContext>(), N);
+  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
+      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
+      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+      dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+}
+
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeWithBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim_untrimed, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
+  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
+  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
+  int pre, n, post;
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+  if (post == 1) {
+    int h = pre;
+    int w = n;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast1CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast1CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  } else {
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast2CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast2CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
+          dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
 void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& x, const framework::Tensor& y,
@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
+  const framework::DDim x_dim = x.dims();
+  const framework::DDim y_dim = y.dims();
   if (x.dims() == y.dims()) {
-    size_t N = static_cast<size_t>(framework::product(x.dims()));
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(), N);
-    for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
-        x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
-        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {  // Y is a scalar
-    auto x_dim = x.dims();
-    auto y_dim = y.dims();
-
-    axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-    trim_trailing_singular_dims(&y_dim);
-    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-    int pre, n, post;
-    get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-    if (post == 1) {
-      int h = pre;
-      int w = n;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast1CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast1CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w,
-            dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
-    } else {
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast2CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-            dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast2CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
-            post, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
+    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  }
+}
+
+// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
+// explicit gradient can cut off X, Y, Out from gradient op
+// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
+// elementwise code.
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
+                                 const framework::Tensor& x,
+                                 const framework::Tensor& y,
+                                 const framework::Tensor& out,
+                                 const framework::Tensor& dout, int axis,
+                                 framework::Tensor* dx, framework::Tensor* dy,
+                                 DX_OP dx_op, DY_OP dy_op) {
+  if (dy == nullptr) {
+    const framework::DDim dx_dims = dout.dims();
+    auto dy_dims = dx_dims;
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  } else {
+    if (dout.dims() == dy->dims()) {
+      const framework::DDim dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+    } else {  // Y is a scalar
+      auto dx_dims = dout.dims();
+      const framework::DDim dy_dims = dy->dims();
+      ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     }
   }
 }
 
+// Deprecated
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
 
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(&y_dims);
+  trim_trailing_singular_dims(y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
       x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+  auto y_dims_untrimed = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
                     "Rank of first input must >= rank of second input.");
 
-  if (x_dims == y_dims) {
+  if (x_dims == y_dims_untrimed) {
     functor.Run();
     return;
   }
 
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(&y_dims);
+  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -15,7 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
+                              "X");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
index fe088b8203722a43b9aba7be3878b8f4ca68ba12..11c7e3fe628001f095836a788f2bcc7c4ee7ad4b 100644
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
+    // skip out, x, y
+    auto* out = dout;
+    auto *x = dout, *y = dout;
+
+    ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
   }
 };
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a297d03cfb041e584159a5fc5ba214f8ac404b4
--- /dev/null
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ExtractRowsOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0],
+                      framework::proto::VarType::SELECTED_ROWS,
+                      "The type of input(X) must be SelectedRows.");
+    auto in_dims = ctx->GetInputDim("X");
+
+    ctx->SetOutputDim(
+        "Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1}));
+  }
+};
+
+class ExtractRowsOp : public framework::OperatorBase {
+ public:
+  ExtractRowsOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
+    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    auto in_rows = in.rows();
+    auto out_dim = framework::make_ddim(
+        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
+    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
+
+    if (paddle::platform::is_gpu_place(in.place())) {
+#ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = pool.Get(in.place());
+      auto src_ptr = in_rows.Data(in.place());
+      auto stream =
+          reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx)
+              .stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr,
+                   boost::get<platform::CUDAPlace>(in.place()), src_ptr,
+                   in_rows.size() * sizeof(int64_t), stream);
+#else
+      PADDLE_THROW("Not compiled with CUDA.");
+#endif
+    } else {
+      memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(),
+                   in_rows.data(), in_rows.size() * sizeof(int64_t));
+    }
+  }
+};
+
+class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(SelectedRows). The input tensor of extract_rows operator,"
+             " and its type is SelectedRows.");
+    AddOutput("Out", "(Tensor). The the rows of input(X).");
+
+    AddComment(R"DOC(
+    ExtractRows Operator.
+
+The function of extract_rows_op is extracting the rows from the input(X)
+whose type is SelectedRows.
+
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker,
+                  ops::ExtractRowsOpInferShape);
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a91e0f520e93c01bc5af09b691af2d5a6deda9f2
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_quantize_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class FakeQuantizeOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeQuantizeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeQuantizeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
+                   "OutMovingScale(Out) of FakeQuantizeOp should not be null");
+    // if (ctx->HasInput("InMovingScale")) {
+    ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
+    //}
+    // if (ctx->HasInput("InScales")) {
+    PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
+                   "OutScales(Out) of FakeQuantizeOp should not be null");
+    ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
+    // PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
+    // ctx->Outputs("OutScales")[0],
+    //                  "Mean and MeanOut should share the same memory");
+    //}
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
+        .AsDispensable();
+    AddInput("InMovingScale", "Last scale, used in static quantization.")
+        .AsDispensable();
+    AddInput("InCurrentIter",
+             "Last iteration number, used in static quantization.")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScales",
+              "(Tensor) scale buffer, used in static quantization.")
+        .AsDispensable();
+    AddOutput("OutMovingScale", " Current scale");
+    AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
+    AddAttr<std::string>("quantize_type",
+                         "(string, default abs_max)"
+                         "The scaling tpe of the quantize operator.")
+        .SetDefault("abs_max");
+    AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int &bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator
+
+quantize_type = abs_max:
+
+    $$scale = max(abs(x))$$ 
+
+quantize_type = range_abs_max:
+
+    $$scale = max(max(abs(x)), history_abs_max)$$ 
+
+quantize_type = moving_average_abs_max:
+
+    $$scale = 0.1*scale+0.9*new_abs_max)$$ 
+
+$$Out = scale*X$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fake_quantize,
+    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be0c6730a5119090600a27c66510b2a095c54583
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -0,0 +1,272 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/operators/fake_quantize_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
+  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = threadIdx.x;
+
+  extern __shared__ T shared_max_data[];
+  if (gridDim.x > 1) {
+    shared_max_data[tid] = T(0);
+    for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
+      T tmp = fabs(in[i]);
+      if (tmp > shared_max_data[tid]) {
+        shared_max_data[tid] = tmp;
+      }
+    }
+  } else {
+    if (bid < n) {
+      shared_max_data[tid] = fabs(in[bid]);
+    } else {
+      shared_max_data[tid] = T(0);
+    }
+  }
+  __syncthreads();
+
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+
+float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
+                    int length) {
+  float host_max;
+  int kNumTheads = 1024;
+  int gridDimx = (kNumTheads - 1 + length) / kNumTheads;
+  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
+  framework::Tensor t;
+  float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
+                                            platform::CUDAPlace());
+  FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float),
+                            ctx.stream()>>>(length, array, device_max);
+  FindAbsMaxKernel<
+      float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>(
+      gridDimx, device_max, device_max);
+  PADDLE_ENFORCE_EQ(
+      cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
+      cudaSuccess, "cudaMemcpy failed");
+  return host_max;
+}
+
+template <typename T>
+__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
+                                    int* num_saturate, const T min,
+                                    const T max) {
+  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = threadIdx.x;
+
+  extern __shared__ int shared_count[];
+  shared_count[tid] = 0;
+  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
+    if (in[i] > max) {
+      out[i] = max;
+      shared_count[tid] += 1;
+    } else if (in[i] < min) {
+      out[i] = min;
+      shared_count[tid] += 1;
+    } else {
+      out[i] = in[i];
+    }
+  }
+  __syncthreads();
+
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i) {
+      shared_count[tid] += shared_count[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    num_saturate[blockIdx.x] = shared_count[0];
+  }
+}
+
+template <typename T>
+__global__ void ReduceKernel(const int n, const T* in, T* out) {
+  int tid = threadIdx.x;
+  extern __shared__ T shared_sum[];
+  if (tid < n) {
+    shared_sum[tid] = in[tid];
+  } else {
+    shared_sum[tid] = T(0);
+  }
+  __syncthreads();
+  // blockDim.x must >= n
+  for (int i = (n + 1) / 2; i > 0; i >>= 1) {
+    if (tid < i) {
+      shared_sum[tid] += shared_sum[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[0] = shared_sum[0];
+  }
+}
+
+template <typename T>
+int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
+                     const T* in, T* out, const T min, const T max) {
+  int host_num_saturate;
+  int kNumTheads = 1024;
+  int gridDimx = (n + kNumTheads - 1) / kNumTheads;
+  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
+  framework::Tensor t;
+  int* device_num_saturate = t.mutable_data<int>(
+      framework::make_ddim({gridDimx}), platform::CUDAPlace());
+  ApplySaturateKernel<
+      T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
+      n, in, out, device_num_saturate, min, max);
+  ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
+      gridDimx, device_num_saturate, device_num_saturate);
+  PADDLE_ENFORCE_EQ(cudaSuccess,
+                    cudaMemcpy(&host_num_saturate, device_num_saturate,
+                               sizeof(int), cudaMemcpyDeviceToHost),
+                    "cudaMemcpy failed");
+  return host_num_saturate;
+}
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
+ public:
+  T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
+                    framework::Tensor* scale_list, framework::Tensor* out_scale,
+                    const T& cur_scale, int window_size,
+                    int current_iter) const {
+    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
+    T remove_tmp = sl[current_iter];
+    sl[current_iter] = cur_scale;
+    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
+    if (max_scale < cur_scale) {
+      max_scale = cur_scale;
+    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
+      int size = (current_iter > window_size) ? window_size : current_iter;
+      max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
+    }
+    return max_scale;
+  }
+
+  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
+                             framework::Tensor* out_scale,
+                             const T& cur_scale) const {
+    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
+    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
+    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
+    return T(outs[0]);
+  }
+
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto& device_ctx = context.cuda_device_context();
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    const bool is_test = context.Attr<bool>("is_test");
+    tensor->mutable_data<T>(in->place());
+    context.Output<framework::Tensor>("OutMovingScale")
+        ->mutable_data<T>(
+            context.Input<framework::Tensor>("InMovingScale")->place());
+    auto quantize_type =
+        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
+    if (quantize_type == std::string("range_abs_max")) {
+      context.Output<framework::Tensor>("OutScales")
+          ->mutable_data<T>(
+              context.Input<framework::Tensor>("InScales")->place());
+      context.Output<framework::Tensor>("OutCurrentIter")
+          ->mutable_data<T>(
+              context.Input<framework::Tensor>("InCurrentIter")->place());
+    }
+
+    T scale = T(1);
+    int window_size = context.Attr<int>("window_size");
+    T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
+    if (quantize_type == std::string("abs_max")) {
+      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
+      scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+      saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+
+      auto& device_ctx = context.template device_context<DeviceContext>();
+      auto* scale_list = context.Output<framework::Tensor>("OutScales");
+      math::SetConstant<DeviceContext, T> scalar;
+      scale_list->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, scale_list, static_cast<T>(0));
+      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+      iter->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, iter, static_cast<T>(0));
+    } else if (quantize_type == std::string("range_abs_max")) {
+      auto* moving_scale = const_cast<framework::Tensor*>(
+          context.Input<framework::Tensor>("InMovingScale"));
+      if (is_test) {
+        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
+      } else {
+        auto* it = const_cast<framework::Tensor*>(
+            context.Input<framework::Tensor>("InCurrentIter"));
+        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+        int* last_iter = it->mutable_data<int>(platform::CPUPlace());
+        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
+        auto* scale_list = context.Output<framework::Tensor>("OutScales");
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+        scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
+                                window_size, current_iter[0]);
+        (*current_iter) = (*last_iter) + 1;
+      }
+    } else if (quantize_type == std::string("moving_average_abs_max")) {
+      auto* moving_scale = const_cast<framework::Tensor*>(
+          context.Input<framework::Tensor>("InMovingScale"));
+      if (is_test) {
+        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
+      } else {
+        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        scale = FindMovingAverageAbsMmax(
+            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
+      }
+    }
+
+    ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
+                        tensor->mutable_data<T>(in->place()), -scale, scale);
+    scale = bin_cnt / scale;
+
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
+    eigen_out.device(dev) = (scale * eigen_in).round();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(fake_quantize,
+                        paddle::operators::FakeQuantizeCUDAKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        paddle::operators::FakeQuantizeCUDAKernel<
+                            paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..80f71d85dde39f773cc489fb79effcc775c5010a
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::Transform;
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeKernel : public framework::OpKernel<T> {
+ public:
+  T FindAbsMax(framework::Tensor* in, int n) const {
+    T* p = in->mutable_data<T>(platform::CPUPlace());
+    T abs_max = (T)0.00000001;
+    for (int i = 0; i < n; i++) {
+      T tmp = fabs(p[i]);
+      if (tmp > abs_max) abs_max = tmp;
+    }
+    return T(abs_max);
+  }
+  T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
+                    const T& cur_scale, int window_size,
+                    int current_iter) const {
+    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
+    T remove_tmp = sl[current_iter];
+    sl[current_iter] = cur_scale;
+    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
+    if (max_scale < cur_scale) {
+      max_scale = cur_scale;
+    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
+      int size = (current_iter > window_size) ? window_size : current_iter;
+      max_scale = T(FindAbsMax(scale_list, size));
+    }
+    return max_scale;
+  }
+
+  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
+                             framework::Tensor* out_scale,
+                             const T& cur_scale) const {
+    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
+    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
+    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
+    return T(outs[0]);
+  }
+
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    const bool is_test = context.Attr<bool>("is_test");
+    tensor->mutable_data<T>(in->place());
+
+    auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
+    oms_tensor->mutable_data<T>(in->place());
+
+    auto quantize_type =
+        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
+    if (quantize_type == std::string("range_abs_max")) {
+      auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
+      oss_tensor->mutable_data<T>(
+          context.Input<framework::Tensor>("InScales")->place());
+      auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
+      oci_tensor->mutable_data<T>(
+          context.Input<framework::Tensor>("InCurrentIter")->place());
+    }
+
+    T scale = static_cast<T>(1);
+    int window_size = context.Attr<int>("window_size");
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto raw_in = framework::EigenVector<T>::Flatten(*in);
+    if (quantize_type == std::string("abs_max")) {
+      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
+      auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+      scale_out.device(dev) = raw_in.abs().maximum();
+      scale = scale_out(0);
+
+      auto& device_ctx = context.template device_context<DeviceContext>();
+      auto* scale_list = context.Output<framework::Tensor>("OutScales");
+      math::SetConstant<DeviceContext, T> scalar;
+      scale_list->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, scale_list, static_cast<T>(0));
+      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+      iter->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, iter, static_cast<T>(0));
+    } else if (quantize_type == std::string("range_abs_max")) {
+      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
+      if (is_test) {
+        scale = moving_scale->data<T>()[0];
+      } else {
+        auto* it = context.Input<framework::Tensor>("InCurrentIter");
+        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+        const int* last_iter = it->data<int>();
+        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
+        auto* scale_list = context.Output<framework::Tensor>("OutScales");
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+        scale_out.device(dev) = raw_in.abs().maximum();
+        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
+        scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
+                                current_iter[0]);
+        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+        (*current_iter) = (*last_iter) + 1;
+      }
+    } else if (quantize_type == std::string("moving_average_abs_max")) {
+      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
+      if (is_test) {
+        scale = moving_scale->data<T>()[0];
+      } else {
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+        scale_out.device(dev) = raw_in.abs().maximum();
+        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
+        scale = FindMovingAverageAbsMmax(
+            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
+        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+      }
+    }
+
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), in->data<T>(),
+          in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
+          ClipFunctor<T>(-scale, scale));
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
+    eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     // get device context from pool
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
 
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 02beb80fc8a9f451393dcdd54492c4f88f908497..d9cd956dfdff3d009d38ee5088f5396080580483 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -36,22 +36,16 @@ class FetchBarrierOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
 
     for (auto& ep : eps) {
       VLOG(3) << "fetch barrier, ep: " << ep;
       rpc_client->AsyncSendFetchBarrier(ep);
     }
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
-
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fdda01381e117cecffb2a05f8399f3ad82a46339
--- /dev/null
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -0,0 +1,169 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class FlattenOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input (X) of Flatten op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Output) of Flatten op should not be null.");
+    const auto &axis = ctx->Attrs().Get<int>("axis");
+    const auto &in_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0.");
+    PADDLE_ENFORCE(
+        axis <= in_dims.size(),
+        "The axis should be less than or equal to input tensor's rank.");
+
+    const auto &out_dims = GetOutputShape(axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1, inner = 1;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      if (i < axis) {
+        outer *= in_dims[i];
+      } else {
+        inner *= in_dims[i];
+      }
+    }
+    std::vector<int32_t> out_shape(2);
+    out_shape[0] = outer;
+    out_shape[1] = inner;
+    return out_shape;
+  }
+};
+
+class FlattenOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddOutput("Out",
+              "A 2D tensor is reshaped input tensor. The input dimensions"
+              "up to axis are flattened to the outer dimension of the output"
+              "and the remaining input dimensions are flattened into the inner"
+              "dimension of the output.");
+    AddAttr<int>("axis",
+                 "(int)"
+                 "Indicate up to which input dimensions (exclusive) should be"
+                 "flattened to the outer dimension of the output. The value"
+                 "for axis must be in the range [0, R], where R is the rank of"
+                 "the input tensor. When axis = 0, the shape of the output"
+                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
+                 "input tensor is (d_0, d_1, ... d_n).")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a 2D matrix.
+
+Examples:
+Case 1:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 2
+  We get:
+    Out.shape = (3 * 100, 4 * 100)
+
+Case 2:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 0
+  We get:
+    Out.shape = (1, 3 * 100 * 100 * 4)
+)DOC");
+  }
+};
+
+class FlattenGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class FlattenGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(in_dims);
+    attrs["inplace"] = false;
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
+                  ops::FlattenOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6fd0aeb021dce40339c32251af130d5984dccd2
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Y"),
+        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(vector<Tensor>)");
+    AddInput("Y", "(vector<Tensor>)");
+    AddOutput("Out", "vector<Tensor>");
+    AddAttr<int>("axis",
+                 "axis is used by elementwise_op, the default value is -1.")
+        .SetDefault(-1);
+    AddAttr<float>("scale",
+                   "scale is used by scale_op, the default value is 0.0.")
+        .SetDefault(0.0);
+    AddAttr<bool>("recomputation",
+                  "Whether to recompute the Out."
+                  "fused_elemwise_activation_grad has two methods to get the "
+                  "dx and dy, one "
+                  "is to use the 'Out', and the other is not to use it. "
+                  "The former method will save the time of recomputing the "
+                  "'Out', but it must occupy the memory to store the 'out'. "
+                  "While, the later method can avoid occupying the memory, "
+                  "but it must recompute the 'Out'. The default value is true.")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>("functor_list",
+                                      "The functors that should be fused.")
+        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
+          PADDLE_ENFORCE(ValidCheck(functor_list));
+        });
+
+    AddComment(R"DOC(
+FusedElemwiseActivation Operator.
+
+At present, FusedElemwiseActivation only supports Two kinds of compound
+operators (elementwise_op and activation_op):
+
+    Z = Binary(X, Unary(Y))
+    Z = Unary(Binary(X, Y))
+
+The attributions of activation_op can be get from fused_elemwise_activation_op's
+attributions. functor_list records the functors to be fused, for example
+"scale,elementwise_add".
+
+)DOC");
+  }
+
+ private:
+  bool ValidCheck(const std::vector<std::string> &functors) {
+    std::unordered_set<std::string> unary_fun = {"scale", "relu"};
+    std::unordered_set<std::string> binary_fun = {"elementwise_add"};
+
+    std::string unary_fun_str;
+    if (binary_fun.count(functors[0])) {
+      unary_fun_str = functors[1];
+    } else if (binary_fun.count(functors[1])) {
+      unary_fun_str = functors[0];
+    } else {
+      PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
+                   functors[1]);
+    }
+    PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
+                      "%s is not included in fused_list.", unary_fun_str);
+    return true;
+  }
+};
+
+class FusedElemwiseActivationGradMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType(this->ForwardOpType() + "_grad");
+
+    for (auto &input_param : this->InputNames()) {
+      op_desc_ptr->SetInput(input_param, this->Input(input_param));
+      op_desc_ptr->SetOutput(framework::GradVarName(input_param),
+                             this->InputGrad(input_param, true));
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      op_desc_ptr->SetInput(output_param, this->Output(output_param));
+      op_desc_ptr->SetInput(framework::GradVarName(output_param),
+                            this->OutputGrad(output_param));
+    }
+    op_desc_ptr->SetAttrMap(this->Attrs());
+
+    std::vector<std::string> functor_names =
+        boost::get<std::vector<std::string>>(
+            op_desc_ptr->GetAttr("functor_list"));
+    functor_names[0] += "_grad";
+    functor_names[1] += "_grad";
+    op_desc_ptr->SetAttr("functor_list", functor_names);
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+
+class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
+    PADDLE_ENFORCE_EQ(input_data_type_index,
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    PADDLE_ENFORCE_EQ(
+        input_data_type_index,
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+        "The element's type of input should be the same.");
+
+    auto input_data_type = framework::ToDataType(input_data_type_index);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
+                  ops::FusedElemwiseActivationMaker,
+                  ops::FusedElemwiseActivationGradMaker);
+REGISTER_OPERATOR(fused_elemwise_activation_grad,
+                  ops::FusedElemwiseActivationOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       double>);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1d2b16b4b5e3a480777f834c2cbeb6d00a755e4
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0017b824532b1210d0ae3e51983d63d081f12a
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
@@ -0,0 +1,425 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/functors.h"
+
+namespace math = paddle::operators::math;
+
+namespace paddle {
+namespace operators {
+
+// CompoundFunctors
+// For example: Z = Binary(X, Unary(Y))
+template <typename T, typename BinaryFun, typename UnaryFun>
+struct BinaryCompoundFunctor {
+  BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
+      : binary_fun_(binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return binary_fun_(x, unary_fun_(y));
+  }
+
+ private:
+  BinaryFun binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+// For example: Z = Unary(Binary(X, Y))
+template <typename T, typename UnaryFun, typename BinaryFun>
+struct UnaryCompoundFunctor {
+  UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
+      : unary_fun_(unary_fun), binary_fun_(binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return unary_fun_(binary_fun_(x, y));
+  }
+
+ private:
+  UnaryFun unary_fun_;
+  BinaryFun binary_fun_;
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get
+// the dx, one is to use the 'out', and the other is not to use it.
+// the former method will save the time of recomputing the
+// 'out', but it must occupy the memory to store the 'out'.
+// While the later method can avoid occupying this memory,
+// but it must recompute the 'out'.
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          bool Recomputation = true>
+struct BinaryCompoundGradDxFunctor {
+  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun)
+      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(x, unary_fun_(y));
+    } else {
+      return dout * d_binary_fun_(x, unary_fun_(y), out);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun, bool Recomputation = true>
+struct BinaryCompoundGradDyFunctor {
+  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun,
+                              const DUnaryFun &d_unary_fun)
+      : d_binary_fun_(d_binary_fun),
+        unary_fun_(unary_fun),
+        d_unary_fun_(d_unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
+    } else {
+      return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+  DUnaryFun d_unary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDxFunctor {
+  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDyFunctor {
+  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(y, x);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename DeviceContext, typename T, typename BinaryFunctor,
+          typename UnaryFunctor>
+static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
+                                     const BinaryFunctor &binary_functor,
+                                     const UnaryFunctor &unary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+  using BinaryCompoundFunctor =
+      BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>;
+
+  ElementwiseComputeEx<BinaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      BinaryCompoundFunctor(binary_functor, unary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename UnaryFunctor,
+          typename BinaryFunctor>
+static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
+                                     const UnaryFunctor &unary_functor,
+                                     const BinaryFunctor &binary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundFunctor =
+      UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>;
+
+  ElementwiseComputeEx<UnaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      UnaryCompoundFunctor(unary_functor, binary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename BinaryGradFunctor,
+          typename UnaryFunctor, typename UnaryGradFunctor,
+          bool Recomputation = true>
+static void RunBinaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const BinaryGradFunctor &binary_grad_functor,
+    const UnaryFunctor &unary_functor,
+    const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using BinaryCompoundDxFunctor =
+      BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  Recomputation>;
+  using BinaryCompoundDyFunctor =
+      BinaryCompoundGradDyFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  UnaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, BinaryCompoundDxFunctor,
+                      BinaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
+      BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
+                              unary_grad_functor));
+}
+
+template <typename DeviceContext, typename T, typename UnaryGradFunctor,
+          typename BinaryFunctor, typename BinaryGradFunctor,
+          bool Recomputation = true>
+static void RunUnaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const UnaryGradFunctor &unary_grad_functor,
+    const BinaryFunctor &binary_functor,
+    const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundDxFunctor =
+      UnaryCompoundGradDxFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+  using UnaryCompoundDyFunctor =
+      UnaryCompoundGradDyFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, UnaryCompoundDxFunctor,
+                      UnaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor),
+      UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor));
+}
+
+template <typename DeviceContext, typename T>
+static void RunFunctors(const framework::ExecutionContext &ctx,
+                        const framework::Tensor *in_x,
+                        const framework::Tensor *in_y,
+                        framework::Tensor *output) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+  // TODO(zcd): The following code can be refined.
+  if (funcs_str == "elementwise_add,scale") {
+    // Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ScaleFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ScaleFunctor<T>(scale), in_x, in_y,
+        output);
+  } else if (funcs_str == "scale,elementwise_add") {
+    // Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ScaleFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ScaleFunctor<T>(scale), math::AddFunctor<T>(), in_x, in_y,
+        output);
+  } else if (funcs_str == "elementwise_add,relu") {
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ReluFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ReluFunctor<T>(), in_x, in_y, output);
+  } else if (funcs_str == "relu,elementwise_add") {
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ReluFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ReluFunctor<T>(), math::AddFunctor<T>(), in_x, in_y, output);
+  } else {
+    PADDLE_THROW("%s has not been implemented.", funcs_str);
+  }
+}
+
+template <typename DeviceContext, typename T>
+static void RunGradFunctors(const framework::ExecutionContext &ctx,
+                            const framework::Tensor *in_x,
+                            const framework::Tensor *in_y,
+                            const framework::Tensor *in_out,
+                            const framework::Tensor *in_out_grad,
+                            framework::Tensor *x_grad,
+                            framework::Tensor *y_grad) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+
+  bool recomputation = ctx.Attr<bool>("recomputation");
+
+  // TODO(zcd): The following code can be refined. for example, use registion
+  if (funcs_str == "elementwise_add_grad,scale_grad") {
+    // The backward of Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    }
+  } else if (funcs_str == "scale_grad,elementwise_add_grad") {
+    // The backward of Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ScaleGradFunctor<T>(scale),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ScaleGradFunctor<T>(scale),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else if (funcs_str == "elementwise_add_grad,relu_grad") {
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    }
+  } else if (funcs_str == "relu_grad,elementwise_add_grad") {
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ReluGradFunctor<T>(),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ReluGradFunctor<T>(),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else {
+    PADDLE_THROW("%s has not been implemented.", funcs_str);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &output = detail::Ref(ctx.Output<framework::Tensor>("Out"),
+                               "Cannot get input tensor %s, variable name = %s",
+                               "Out", ctx.op().Output("Out"));
+
+    RunFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &in_out = detail::Ref(ctx.Input<framework::Tensor>("Out"),
+                               "Cannot get input tensor %s, variable name = %s",
+                               "Out", ctx.op().Input("Out"));
+    auto &in_out_grad =
+        detail::Ref(ctx.Input<framework::Tensor>(framework::GradVarName("Out")),
+                    "Cannot get input tensor %s, variable name = %s",
+                    framework::GradVarName("Out"),
+                    ctx.op().Input(framework::GradVarName("Out")));
+
+    framework::Tensor *x_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    framework::Tensor *y_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    RunGradFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &in_out, &in_out_grad,
+                                      x_grad, y_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dadd054b9a6f8d44f4e5832888052bffde34c827
--- /dev/null
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -0,0 +1,167 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+/**
+ * Organize the classes into a binary tree. At each node, a sigmoid function
+ * is used to calculate the probability of belonging to the right branch.
+ * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
+ * Hierarchical Probabilistic Neural Network Language Model."
+ *
+ * Here we uses a simple way of making the binary tree.
+ * Assuming the number of classes C = 6,
+ * The classes are organized as a binary tree in the following way:
+ *
+ * @code{.py}
+ * *-*-*- 2
+ * | | |- 3
+ * | |
+ * | |-*- 4
+ * |   |- 5
+ * |
+ * |-*- 0
+ *   |- 1
+ * @endcode
+ *
+ * where * indicates an internal node, and each leaf node represents a class.
+ * - Node 0 ... C-2 are internal nodes.
+ * - Node C-1 ... 2C-2 are leaf nodes.
+ * - Class c is represented by leaf node \f$c+C-1\f$.
+ *
+ * We assign an id for each node:
+ * - the id of root be 0.
+ * - the left child of a node i is 2*i+1.
+ * - the right child of a node i is 2*i+2.
+ *
+ * It's easy to see that:
+ * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
+ * - the j-th level ancestor of node i is
+ * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
+ * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
+ *
+ */
+
+class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
+                   "Output(PreOut) should not be null.");
+    const int64_t batch_size = ctx->GetInputDim("X")[0];
+    std::vector<int64_t> output_shape({batch_size, 1});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.GetPlace());
+  }
+};
+
+template <typename AttrType>
+class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, required) The input tensor with shape [N, D], "
+             "where N is the size of mini-batch, and D is the feature size.");
+    AddInput("W",
+             "(Tensor, required), The parameters of hierarchical "
+             "sigmoid operator, each of them is a 2-D tensor, the shape is"
+             "[num_classes - 1, D].");
+    AddInput("Label",
+             "(Tensor, required), The labels of training data. It's a"
+             "tensor with shape [N, 1].");
+    AddInput("Bias",
+             "(Tensor, optional), The bias is a tensor with shape"
+             "[1, num_classes - 1].");
+    AddOutput("Out",
+              "(Tensor, required) The output of hierarchical sigmoid operator."
+              "The shape is [N, 1].");
+    AddOutput("PreOut",
+              "(Tensor, required) A intermedia 2-D tensor with shape "
+              "[batch_size, code_length], where code_length represents the "
+              "maximum path length from root to leaf nodes.")
+        .AsIntermediate();
+    AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
+        .SetDefault(2);
+    AddComment(R"DOC(
+The hierarchical sigmoid operator organize the classes into a binary tree.
+At each node, a sigmoid function is used to calculate the probability of
+belonging to the right branch. This idea is from
+"F. Morin, Y. Bengio (AISTATS 05):
+Hierarchical Probabilistic Neural Network Language Model."
+      )DOC");
+  }
+};
+
+class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PreOut"),
+                   "Input(Preout) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
+                   "Output(W@Grad should not be null.)");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
+                  ops::HierarchicalSigmoidOpMaker<int>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
+REGISTER_OP_CPU_KERNEL(
+    hierarchical_sigmoid,
+    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
+REGISTER_OP_CPU_KERNEL(
+    hierarchical_sigmoid_grad,
+    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
+                                         float>,
+    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
+                                         double>);
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..64096a717b12ed231344649f5eb76b7e4b9af4a6
--- /dev/null
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/matrix_bit_code.h"
+#include "paddle/fluid/platform/transform.h"
+namespace paddle {
+namespace operators {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+using platform::Transform;
+
+template <typename DeviceContext, typename T>
+class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* w = ctx.Input<framework::Tensor>("W");
+    auto* label = ctx.Input<framework::Tensor>("Label");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
+    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
+    int64_t code_length = math::FindLastSet(num_classes - 1);
+    int64_t batch_size = in->dims()[0];
+    framework::Tensor sum;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto* pre_out_data = pre_out->mutable_data<T>(
+        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
+    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
+    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
+    // 0s can avoid out of path's loss.
+    math::SetConstant<DeviceContext, T> zero;
+    zero(dev_ctx, pre_out, static_cast<T>(0.0));
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::RowwiseSum<DeviceContext, T> row_sum;
+    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
+
+    std::vector<int64_t> sum_dims({batch_size, 1UL});
+    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
+    auto sum_mat = EigenMatrix<T>::From(sum);
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_mat = framework::EigenVector<T>::Flatten(*out);
+    if (bias) {
+      bit_code.Add(pre_out, *bias);
+    }
+    bit_code.Mul(pre_out, *w, *in);
+    // clip to [-40, 40]
+    Transform<DeviceContext> trans;
+    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
+          pre_out_data + pre_out->numel(), pre_out_data,
+          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
+    bit_code.Sum(*pre_out, out, static_cast<T>(-1));
+    // use softrelu to calculate cross entropy
+    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
+    row_sum(dev_ctx, *pre_out, &sum);
+    // TODO(guosheng): Subtract the out of path's loss, since not all
+    // class(leaf) nodes' path lengths equal code_length. But it won't break the
+    // gradient check since both have the out of path's loss and will cancel out
+    // each other.
+    out_mat.device(place) = sum_mat + out_mat;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* w = ctx.Input<framework::Tensor>("W");
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
+    auto* bias_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+    auto* label = ctx.Input<framework::Tensor>("Label");
+    auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor pre_out_grad;
+
+    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
+    in_grad->mutable_data<T>(ctx.GetPlace());
+    w_grad->mutable_data<T>(ctx.GetPlace());
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    zero(dev_ctx, in_grad, static_cast<T>(0.0));
+    zero(dev_ctx, w_grad, static_cast<T>(0.0));
+
+    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
+    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
+
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
+    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
+
+    // softrelu derivative
+    pre_out_grad_mat.device(place) =
+        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    bit_code.Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
+    pre_out_grad_mat.device(place) =
+        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
+    // be consistent with the clipping in forward.
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(ctx.GetPlace());
+      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
+      bit_code.AddGrad(pre_out_grad, bias_grad);
+    }
+    bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
+    bit_code.MulGradError(pre_out_grad, *w, in_grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 0669661d225c664010fce97f0a526b62988b92c5..8efd43928aac994c7630a213f6724e8f50abc7e0 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <string>
 #include <vector>
 
 namespace paddle {
@@ -28,27 +29,18 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
                    "Input(X) of Im2SequenceOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of Im2SequenceOp op should not be null.");
-
     auto in_dim = ctx->GetInputDim("X");
+
     PADDLE_ENFORCE_EQ(in_dim.size(), 4,
                       "Input(X) format must be 4D tensor, eg., NCHW.");
+    int img_channels = in_dim[1];
 
     auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
     auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
     auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
-    int batch_size = in_dim[0];
-    int img_channels = in_dim[1];
-    int img_height = in_dim[2];
-    int img_width = in_dim[3];
-
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                         paddings[2], strides[0]);
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                        paddings[3], strides[1]);
-
-    ctx->SetOutputDim("Out", {batch_size * output_height * output_width,
-                              img_channels * kernels[0] * kernels[1]});
+    ctx->SetOutputDim("Out",
+                      {in_dim[0], img_channels * kernels[0] * kernels[1]});
   }
 };
 
@@ -61,6 +53,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
              "C: channels"
              "H: height"
              "W: width");
+    AddInput("Y",
+             "(Tensor) The input tensor of image real size(H, W)."
+             "2-D with shape [batchsize, 2]")
+        .AsDispensable();
     AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
     AddAttr<std::vector<int>>("kernels",
                               "(vector<int>), the "
@@ -73,6 +69,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
                               "(vector<int> default:{0, 0, 0, 0}), the "
                               "paddings(up_pad, left_pad, down_pad, right_pad)")
         .SetDefault({0, 0, 0, 0});
+    AddAttr<std::vector<int>>("out_stride",
+                              "the attribute is valid only when input(Y)"
+                              "is not NULL.this attribute represents the"
+                              "scaling of the pic through the CNN"
+                              "(vector<int> dedault:{1,1}),the out_stride"
+                              " (out_stride_height, out_stride_width)")
+        .SetDefault({1, 1});
     AddComment(R"DOC(
 This op uses kernels to scan images and converts these images to sequences.
 After expanding, The number of time steps are output_height * output_width
@@ -123,7 +126,7 @@ output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
                [ 7.  1.  7.  9.  2.  1.  3.  5.]
                [ 5.  7.  2.  4.  1.  3.  9.  0.]
                [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-output.dims = {8, 9}
+output.dims = {8, 8}
 output.lod = [[0, 4, 8]]
 
 )DOC");
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index d792c68f784d8ffec0eb303a6ab9b59c9f121fa7..4a9942819414d552eb69bd0b30b66aab76a2dbf4 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -39,50 +40,107 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     const Tensor* in = ctx.Input<Tensor>("X");
     LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
-    // being available for python API
-    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
-    //                  "Input(X) layout must be NCHW");
     auto in_dim = in->dims();
     int batch_size = in_dim[0];
     int img_channels = in_dim[1];
     int img_height = in_dim[2];
     int img_width = in_dim[3];
-
     auto kernels = ctx.Attr<std::vector<int>>("kernels");
     auto strides = ctx.Attr<std::vector<int>>("strides");
     auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
-                                         paddings[2], strides[0]);
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
-                                        paddings[3], strides[1]);
-
-    const std::vector<int> dilations({1, 1});
-
-    auto out_dims = out->dims();
-    out->Resize({batch_size, out->numel() / batch_size});
-    for (int i = 0; i < batch_size; i++) {
-      const Tensor src =
-          in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-      Tensor dst = out->Slice(i, i + 1).Resize(
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
-
-      math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      f(dev_ctx, src, dilations, strides, paddings, &dst);
-    }
-    out->Resize(out_dims);
-
-    // set lod information
-    // TODO(wanghaoshuang): Move this to InferShape
-    framework::LoD lod(1);
-    lod[0].reserve(batch_size + 1);
-    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+    if (ctx.HasInput("Y") && batch_size > 1) {
+      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
+      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
+      Tensor cpu_shape_tensor;
+      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
+      std::vector<int> imgreal_h;
+      std::vector<int> imgreal_w;
+      std::vector<int> output_height;
+      std::vector<int> output_width;
+      int result = 0;
+      for (int i = 0; i < batch_size; i++) {
+        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
+        int tmp_real_w =
+            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
+        if (tmp_real_h % out_stride[0] == 0) {
+          tmp_real_h = tmp_real_h / out_stride[0];
+        } else {
+          tmp_real_h = tmp_real_h / out_stride[0] + 1;
+        }
+        if (tmp_real_w % out_stride[1] == 0) {
+          tmp_real_w = tmp_real_w / out_stride[1];
+        } else {
+          tmp_real_w = tmp_real_w / out_stride[1] + 1;
+        }
+        imgreal_h.push_back(tmp_real_h);
+        imgreal_w.push_back(tmp_real_w);
+        output_height.push_back(Im2SeqOutputSize(
+            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
+        output_width.push_back(Im2SeqOutputSize(
+            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
+        result += output_height[i] * output_width[i];
+      }
+
+      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+
+      const std::vector<int> dilations({1, 1});
+      int offset_out = 0;
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst = out->Slice(offset_out,
+                                offset_out + output_height[i] * output_width[i])
+                         .Resize({output_height[i], output_width[i],
+                                  img_channels, kernels[0], kernels[1]});
+        offset_out += output_height[i] * output_width[i];
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
+      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height[i] * output_width[i];
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
+    } else {
+      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+                                           paddings[2], strides[0]);
+      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+                                          paddings[3], strides[1]);
+      out->mutable_data<T>({batch_size * output_height * output_width,
+                            img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+      const std::vector<int> dilations({1, 1});
+      auto out_dims = out->dims();
+      out->Resize({batch_size, out->numel() / batch_size});
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst =
+            out->Slice(i, i + 1).Resize({output_height, output_width,
+                                         img_channels, kernels[0], kernels[1]});
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      out->Resize(out_dims);
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
       lod[0].push_back(offset);
-      offset += output_height * output_width;
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height * output_width;
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
     }
-    out->set_lod(lod);
   }
 };
 
diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
index 2ab9385b2c4eaa82f20430fd59dfab1287022af4..b2900cbf352636acc0db1bbb86778be4cea0aacf 100644
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -42,6 +42,9 @@ inline static int GetDesiredBlockDim(int block_dim) {
   FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
   FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
 
+static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); }
+static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); }
+
 template <typename T, int BlockDim>
 __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                  T *y, T *mean, T *var, float epsilon,
@@ -55,7 +58,7 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
   // Step 1: Reduce to calculate mean
   T mean_val = static_cast<T>(0);
   for (int i = beg_idx; i < end_idx; i += BlockDim) {
-    mean_val += x[beg_idx];
+    mean_val += x[i];
   }
   mean_val = BlockReduce(temp_storage).Reduce(mean_val, cub::Sum());
   if (threadIdx.x == 0) mean[blockIdx.x] = mean_val / feature_size;
@@ -69,11 +72,9 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
     var_val += (tmp * tmp);
   }
   var_val = BlockReduce(temp_storage).Reduce(var_val, cub::Sum());
-  if (threadIdx.x == 0) {
-    var[blockIdx.x] = var_val / feature_size;
-  }
+  if (threadIdx.x == 0) var[blockIdx.x] = var_val / feature_size;
   __syncthreads();
-  var_val = static_cast<T>(sqrt(var[blockIdx.x] + epsilon));
+  var_val = static_cast<T>(real_sqrt(var[blockIdx.x] + epsilon));
 
   // Step 3: Calculate y
   if (scale != nullptr) {
@@ -104,9 +105,10 @@ __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
 }
 
 template <typename T>
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-  __device__ __forceinline__ Pair(const T &first, const T &second)
+struct PairForLayerNormBackward {
+  __device__ __forceinline__ PairForLayerNormBackward() {}
+  __device__ __forceinline__ PairForLayerNormBackward(const T &first,
+                                                      const T &second)
       : first_(first), second_(second) {}
 
   T first_;
@@ -114,10 +116,12 @@ struct Pair {
 };
 
 template <typename T>
-struct PairAddFunctor {
-  __device__ __forceinline__ Pair<T> operator()(const Pair<T> &p1,
-                                                const Pair<T> &p2) {
-    return Pair<T>(p1.first_ + p2.first_, p1.second_ + p2.second_);
+struct PairForLayerNormBackwardAddFunctor {
+  __device__ __forceinline__ PairForLayerNormBackward<T> operator()(
+      const PairForLayerNormBackward<T> &p1,
+      const PairForLayerNormBackward<T> &p2) {
+    return PairForLayerNormBackward<T>(p1.first_ + p2.first_,
+                                       p1.second_ + p2.second_);
   }
 };
 
@@ -129,7 +133,7 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
                                              const T *mean, const T *var,
                                              const T *scale, float epsilon,
                                              int batch_size, int feature_size) {
-  using BlockReduce = cub::BlockReduce<Pair<T>, BlockDim>;
+  using BlockReduce = cub::BlockReduce<PairForLayerNormBackward<T>, BlockDim>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   int beg_idx = threadIdx.x * feature_size + blockIdx.x;
@@ -139,15 +143,16 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y,
 
   for (int i = beg_idx; i < end_idx; i += stride) {
     int row_idx = i / feature_size;
-    auto var_val = static_cast<T>(sqrt(var[row_idx] + epsilon));
+    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
     d_scale_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
     d_bias_partial += d_y[i];
     if (HasDx) d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
   }
 
-  auto pair = BlockReduce(temp_storage)
-                  .Reduce(Pair<T>(d_scale_partial, d_bias_partial),
-                          PairAddFunctor<T>());
+  auto pair =
+      BlockReduce(temp_storage)
+          .Reduce(PairForLayerNormBackward<T>(d_scale_partial, d_bias_partial),
+                  PairForLayerNormBackwardAddFunctor<T>());
 
   if (threadIdx.x == 0) {
     d_scale[blockIdx.x] = pair.first_;
@@ -172,7 +177,7 @@ __global__ void LayerNormBackwardGradientScaleOrBias(
 
   for (int i = beg_idx; i < end_idx; i += stride) {
     int row_idx = i / feature_size;
-    auto var_val = static_cast<T>(sqrt(var[row_idx] + epsilon));
+    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
     if (HasDScale) {
       d_scale_or_d_bias_partial += d_y[i] * (x[i] - mean[row_idx]) / var_val;
     } else {  // d_bias != nullptr
@@ -180,10 +185,11 @@ __global__ void LayerNormBackwardGradientScaleOrBias(
     }
 
     if (HasDx) {
-      if (scale != nullptr)
+      if (scale != nullptr) {
         d_x[i] = d_y[i] * scale[blockIdx.x] / var_val;
-      else
+      } else {
         d_x[i] = d_y[i] / var_val;
+      }
     }
   }
 
@@ -208,7 +214,7 @@ __global__ void LayerNormBackwardGradientOnlyX(const T *d_y, T *d_x,
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < batch_size * feature_size) {
     int row_idx = idx / feature_size;
-    auto var_val = static_cast<T>(sqrt(var[row_idx] + epsilon));
+    auto var_val = static_cast<T>(real_sqrt(var[row_idx] + epsilon));
     if (scale != nullptr) {
       int col_idx = idx % feature_size;
       d_x[idx] = d_y[idx] * scale[col_idx] / var_val;
@@ -224,15 +230,19 @@ __global__ void LayerNormBackwardWhenBatchSizeIsOne(
     const T *var, const T *scale, float epsilon, int feature_size) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < feature_size) {
-    auto var_val = static_cast<T>(sqrt(var[idx] + epsilon));
+    auto var_val = static_cast<T>(real_sqrt(var[idx] + epsilon));
     if (d_x != nullptr) {
-      if (d_scale == nullptr)
+      if (d_scale == nullptr) {
         d_x[idx] = d_y[idx] / var_val;
-      else
+      } else {
         d_x[idx] = d_y[idx] * scale[idx] / var_val;
+      }
     }
-    if (d_scale != nullptr)
+
+    if (d_scale != nullptr) {
       d_scale[idx] = d_y[idx] * (x[idx] - mean[idx]) / var_val;
+    }
+
     if (d_bias != nullptr) d_bias[idx] = d_y[idx];
   }
 }
@@ -243,9 +253,9 @@ static void LayerNormBackward(const T *x, const T *d_y, const T *scale,
                               T *d_bias, float epsilon, int batch_size,
                               int feature_size, cudaStream_t stream) {
   const int kMaxBlockDim = 512;
-  int gradient_flag = (static_cast<int>(d_x != nullptr) << 2) |
-                      (static_cast<int>(d_scale != nullptr) << 1) |
-                      (static_cast<int>(d_bias != nullptr));
+  int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
+                      ((d_scale != nullptr ? 1 : 0) << 1) |
+                      ((d_bias != nullptr ? 1 : 0));
   if (gradient_flag == 0) return;
 
   if (batch_size == 1) {
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 56e39649b409f7eed108027f6df58c19dd3c8ab8..e14b148cc00f425e90b0b2256ab3462753a34f47 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -19,12 +19,17 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "gflags/gflags.h"
+
 #include "paddle/fluid/operators/detail/macros.h"
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DEFINE_int32(listen_and_serv_profile_period, 0,
+             "the period of listen_and_serv to do profile");
+
 namespace paddle {
 namespace operators {
 
@@ -61,6 +66,8 @@ static void ParallelExecuteBlocks(
         framework::Async([&executor, &prepared, &program, &scope, idx]() {
           int run_block = idx;  // thread local
           try {
+            VLOG(3) << "running server block: " << run_block
+                    << "pointer: " << prepared[run_block].get();
             executor->RunPreparedContext(prepared[run_block].get(), scope);
           } catch (const std::exception &e) {
             LOG(ERROR) << "run sub program error " << e.what();
@@ -107,18 +114,31 @@ void ListenAndServOp::RunSyncLoop(
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
 
-  std::vector<int> optimize_blocks_idx;
-  for (auto blk : optimize_blocks) {
-    optimize_blocks_idx.push_back(blk->ID());
+  // Prepare all the server block
+  std::vector<int> optimize_blocks_list;
+  for (size_t i = 1; i < program->Size(); ++i) {
+    optimize_blocks_list.push_back(i);
   }
-  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
-  // Insert placeholder for block0 which holds current op itself.
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
+  // Insert placeholder for block0 which holds current op itself,
+  // NOTE the first block in `optimize_prepared` should never be ran.
   optimize_prepared.insert(
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->ResetBarrierCounter();
+
+  int32_t profile_step = 0;
   while (true) {
+    PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period,
+                      "profile_step should not be larger then "
+                      "FLAGS_listen_and_serv_profile_period");
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == 0) {
+        auto pf_state = paddle::platform::ProfilerState::kCPU;
+        paddle::platform::EnableProfiler(pf_state);
+      }
+    }
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
     rpc_service_->SetCond(distributed::kRequestSend);
@@ -160,6 +180,15 @@ void ListenAndServOp::RunSyncLoop(
     // reset received sparse vars to avoid reuse it in the next mini-batch
     dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
         ->ResetSparseVarRecorder();
+    if (FLAGS_listen_and_serv_profile_period > 0) {
+      if (profile_step == FLAGS_listen_and_serv_profile_period) {
+        paddle::platform::DisableProfiler(
+            paddle::platform::EventSortingKey::kTotal, "/dev/null");
+        profile_step = 0;
+      } else {
+        profile_step++;
+      }
+    }
   }  // while(true)
 }
 
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..27e26cb1b5c1e831f05dac299489628b92eaa58c 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
-
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
     auto filename = Attr<std::string>("file_path");
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index bda499432214b8841c8dfc406ee45ca0367920e7..d77b095c5d783a2a9fab87eb8b458117a6a3d225 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -32,20 +32,21 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
+    int ids_rank = ids_dims.size();
 
-    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W
-    // and it must be a column vector with rank = 2 while the 2nd dimension
-    // size must be 1, when Ids's type is SelectedRows, the rows of Ids
-    // contains the ids to be looked up in W;
-    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
-    }
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                      "The last dimension of the 'Ids' tensor must be 1.");
+
+    auto output_dims =
+        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+    output_dims.push_back(table_dims[1]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
 
-    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
-    ctx->ShareLoD("Ids", /*->*/ "Out");
+    if (ctx->GetOutputsVarType("Out")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      ctx->ShareLoD("Ids", /*->*/ "Out");
+    }
   }
 
  protected:
@@ -62,17 +63,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("W",
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
-    AddInput(
-        "Ids",
-        "(Tensor or SelectedRows) Ids's type can be Tensor or "
-        "SelectedRows, when Ids's type is Tensor, this tensor contains "
-        "the ids to be looked up in W and it must be a column vector with "
-        "rank = 2 while the 2nd dimension size must be 1; when Ids's type is "
-        "SelectedRows, the rows of Ids contains the ids to be looked up "
-        "in W.");
-    AddOutput("Out",
-              "(Tensor or SelectedRows) The lookup results, which have the "
-              "same type as W.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "The last dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
     AddAttr<bool>("is_sparse",
                   "(boolean, default false) "
                   "Sparse update.")
@@ -90,15 +85,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 Lookup Table Operator.
 
 This operator is used to perform lookups on the parameter W,
-then concatenated into a dense or sparse tensor.
-
-The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's
-type is SelectedRows, the rows of Ids contains the ids to be looked up in W;
-when Ids's type is Tensor, this tensor contains the ids to be looked up in W
-and it must be a column vector with rank = 2 while the 2nd dimension size must be 1,
-at this time, Ids can carry the LoD (Level of Details) information, or not, and
-the output only shares the LoD information with input Ids.
+then concatenated into a dense tensor.
 
+The input Ids can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD information with input Ids.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 77722c50d39003d9342afb04a61ae3aaf6b21100..74823dab09cac358f647c074ac2f2ee2fed17e55 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -23,7 +23,7 @@ namespace operators {
 
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
           bool PaddingFlag>
-__global__ void LookupTable(T* output, const T* table, const int64_t* ids,
+__global__ void LookupTable(T *output, const T *table, const int64_t *ids,
                             const int64_t N, const int64_t K, const int64_t D,
                             const int64_t padding_idx) {
   int idx = threadIdx.x;
@@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
     int64_t id = ids[idy];
     PADDLE_ASSERT(id >= 0);
     PADDLE_ASSERT(id < N);
-    T* out = output + idy * D;
-    const T* tab = table + id * D;
+    T *out = output + idy * D;
+    const T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
       if (PaddingFlag) {
         if (id == padding_idx)
@@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
 }
 
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
+__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
                                 const int64_t N, const int64_t K,
                                 const int64_t D) {
   int idx = threadIdx.x;
@@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
     int id = ids[idy];
     PADDLE_ASSERT(id >= 0);
     PADDLE_ASSERT(id < N);
-    const T* out = output + idy * D;
-    T* tab = table + id * D;
+    const T *out = output + idy * D;
+    T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
       paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
     }
@@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
 template <typename T>
 class LookupTableCUDAKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* table_t = context.Input<LoDTensor>("W");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_t = context.Input<LoDTensor>("W");
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+    auto *output_t = context.Output<LoDTensor>("Out");
     int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    auto* ids_var = context.InputVar("Ids");
-    Tensor* output_t = context.Output<Tensor>("Out");
-
-    int64_t* ids;
-    int64_t K;
-
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W;
-    // when Ids's type is SelectedRows, the rows of Ids contains the
-    // ids to be looked up in W.
-    if (ids_var->IsType<framework::LoDTensor>()) {
-      auto* ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t*>(ids_t->data<int64_t>());
-      K = ids_t->numel();
-    } else if (ids_var->IsType<framework::SelectedRows>()) {
-      auto* ids_t = context.Input<framework::SelectedRows>("Ids");
-      ids = const_cast<int64_t*>(ids_t->rows().CUDAData(context.GetPlace()));
-      K = ids_t->rows().size();
-      output_t->Resize({K, table_t->dims()[1]});
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Ids");
-    }
 
     size_t N = table_t->dims()[0];
     size_t D = table_t->dims()[1];
-    auto* table = table_t->data<T>();
-    auto* output = output_t->mutable_data<T>(context.GetPlace());
+    size_t K = ids_t->numel();
+
+    auto *ids = ids_t->data<int64_t>();
+    auto *table = table_t->data<T>();
+    auto *output = output_t->mutable_data<T>(context.GetPlace());
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
@@ -122,41 +105,44 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 template <typename T>
 class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx =
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* table = context.Input<LoDTensor>("W");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *table = context.Input<LoDTensor>("W");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
-      auto* ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
+      auto *ids_data = ids->data<int64_t>();
+      int64_t ids_num = ids->numel();
 
       auto stream = dev_ctx.stream();
       // copy GPU memory to CPU pinned memory
       framework::Vector<int64_t> new_rows;
-      new_rows.resize(ids_dim[0]);
+      new_rows.resize(ids_num);
       auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
       // TODO(yuyang18): Strange code here.
       memory::Copy(platform::CPUPlace(),
                    new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                   ids_data, ids_dim[0] * sizeof(int64_t), stream);
+                   ids_data, ids_num * sizeof(int64_t), stream);
 
       d_table->set_rows(new_rows);
 
-      auto* d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_num, table->dims()[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
-      auto* d_table_data = d_table_value->data<T>();
-      auto* d_output_data = d_output->data<T>();
-      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      auto *d_table_data = d_table_value->data<T>();
+      auto *d_output_data = d_output->data<T>();
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
       memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
                    d_output->numel() * sizeof(T), stream);
 
@@ -168,9 +154,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       int N = d_table_t->dims()[0];
       int D = d_table_t->dims()[1];
       int K = ids_t->numel();
-      const int64_t* ids = ids_t->data<int64_t>();
-      const T* d_output = d_output_t->data<T>();
-      T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+      const int64_t *ids = ids_t->data<int64_t>();
+      const T *d_output = d_output_t->data<T>();
+      T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
 
       auto t = framework::EigenVector<T>::Flatten(*d_table_t);
       t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index d482506bf0361c11a019e32efbf348a64aaf5164..f5c10ced8305b64c6386c5051804f8c9a8f71802 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -36,43 +36,13 @@ template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
     auto *table_var = context.InputVar("W");
-    auto *ids_var = context.InputVar("Ids");
-    Tensor *output_t = context.Output<Tensor>("Out");
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-
-    DDim table_dim;
 
-    if (table_var->IsType<LoDTensor>()) {
-      table_dim = context.Input<LoDTensor>("W")->dims();
-    } else if (table_var->IsType<SelectedRows>()) {
-      auto *table_t = context.Input<SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(
-          "The parameter W of a LookupTable "
-          "must be either LoDTensor or SelectedRows");
-    }
-
-    int64_t *ids;
-    int64_t ids_numel;
-
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W;
-    // when Ids's type is SelectedRows, the rows of Ids contains the
-    // ids to be looked up in W.
-    if (ids_var->IsType<LoDTensor>()) {
-      auto *ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-      ids_numel = ids_t->numel();
-    } else if (ids_var->IsType<SelectedRows>()) {
-      auto *ids_t = context.Input<SelectedRows>("Ids");
-      ids = const_cast<int64_t *>(ids_t->rows().data());
-      ids_numel = ids_t->rows().size();
-      output_t->Resize({ids_numel, table_dim[1]});
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Ids");
-    }
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
+    int64_t ids_numel = ids_t->numel();
 
     if (table_var->IsType<LoDTensor>()) {
       auto *table_t = context.Input<LoDTensor>("W");
@@ -139,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
+      int64_t ids_num = ids->numel();
 
       framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_dim[0]);
-      for (int64_t i = 0; i < ids_dim[0]; i++) {
+      new_rows.reserve(ids_num);
+      for (int64_t i = 0; i < ids_num; i++) {
         new_rows.push_back(ids_data[i]);
       }
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table_dim[1]});
+      d_table_value->Resize({ids_num, table_dim[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
       d_table->set_height(table_dim[0]);
@@ -157,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table_value->data<T>();
 
-      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
       memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
@@ -165,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
 
       int N = table_dim[0];
-      int D = d_output->dims()[1];
+      int D = table_dim[1];
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 5571ff9a7151c1f971ad1805bf001815a651202b..d2b772d11379c218be77277b89f3ded7b59ab9f3 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -51,6 +51,7 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+math_library(matrix_bit_code)
 math_library(unpooling)
 math_library(vol2col)
 
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 9f6c1e5c35f02cd4bc729eea78b17fac017aa90e..70f88f24f682e05972ca73ef7b50f96be50d1ef4 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -21,6 +21,10 @@
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
+#ifdef PADDLE_WITH_LIBXSMM
+#include <libxsmm.h>
+#endif
+
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
 #endif
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 2ce94cfc93823aa891114ef8fd1e851727ebc623..a0802ef90ca7e30a2b22d187cb9092163518d8e9 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
+#include <limits>
 #include <vector>
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -30,6 +31,13 @@ struct CBlas<float> {
     platform::dynload::cblas_sgemm(args...);
   }
 
+#ifdef PADDLE_WITH_LIBXSMM
+  template <typename... ARGS>
+  static void SMM_GEMM(ARGS... args) {
+    libxsmm_sgemm(args...);
+  }
+#endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_saxpy(args...);
@@ -63,6 +71,13 @@ struct CBlas<double> {
     platform::dynload::cblas_dgemm(args...);
   }
 
+#ifdef PADDLE_WITH_LIBXSMM
+  template <typename... ARGS>
+  static void SMM_GEMM(ARGS... args) {
+    libxsmm_dgemm(args...);
+  }
+#endif
+
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_daxpy(args...);
@@ -137,9 +152,13 @@ struct CBlas<double> {
   }
 };
 #endif
+
 template <>
 struct CBlas<platform::float16> {
   static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+  static void SMM_GEMM(...) {
+    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
+  }
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -147,6 +166,64 @@ struct CBlas<platform::float16> {
 #endif
 };
 
+template <typename T>
+inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
+                    bool transb, const T &alpha, const T &beta) {
+#ifdef PADDLE_WITH_LIBXSMM
+  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
+  // But the threshold is custom
+  constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
+  if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
+      std::abs<T>(alpha - static_cast<T>(1) >
+                  std::numeric_limits<T>::epsilon()) ||
+      std::abs<T>(beta) > std::numeric_limits<T>::epsilon()) {
+    return false;
+  } else {
+    return true;
+  }
+#endif
+  return false;
+}
+
+template <>
+inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
+                                       bool transa, bool transb,
+                                       const platform::float16 &alpha,
+                                       const platform::float16 &beta) {
+  return false;
+}
+
+template <typename T>
+inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
+                      CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
+                      const T *A, int lda, const T *B, int ldb, T beta, T *C,
+                      int ldc) {
+#ifdef PADDLE_WITH_LIBXSMM
+  if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
+                 beta)) {
+    // Note: SMM use ColMajor
+    const char transa = 'N';
+    const char transb = 'N';
+    CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
+                       &beta, C, &ldc);
+    return;
+  }
+#endif
+
+#ifdef PADDLE_MKL_SPLIT_GEMM
+  constexpr int bs = 2;
+  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
+    for (int off = 0; off < M; off += bs) {
+      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
+                     A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc);
+    }
+    return;
+  }
+#endif
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -156,8 +233,8 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
-  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-                 beta, C, ldc);
+  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+               beta, C, ldc);
 }
 
 template <>
@@ -166,9 +243,9 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                             int N, int K, T alpha, const T *A,
                                             int lda, const T *B, int ldb,
                                             T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-                 lda, B, ldb, beta, C, ldc);
+  GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+               lda, B, ldb, beta, C, ldc);
 }
 
 template <typename DeviceContext>
diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad2f49ccbf5ff37d33cc9e71c1a683571f4f8137
--- /dev/null
+++ b/paddle/fluid/operators/math/functors.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// AddFunctor
+template <typename T>
+struct AddFunctor {
+  // out = x + y;
+  inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
+};
+
+template <typename T>
+struct AddGradFunctor {
+  inline HOSTDEVICE T operator()(T x, T y) { return 1; }
+
+  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
+};
+
+template <typename T>
+struct ScaleFunctor {
+  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ScaleGradFunctor {
+  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T x) { return coeff_; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ReluFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+};
+
+template <typename T>
+struct ReluGradFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 336d6febc2ce3a55e82ed613bbc1081101f822f0..1472edbbf47e3e4d6b22c65349713904b13647d2 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 namespace paddle {
 namespace operators {
@@ -35,51 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col->dims().size() == 5);
 
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
-
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * col_height + h) * col_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
+    if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
+        dilation[1] == 1) {
+      if (padding[0] == 0 && padding[1] == 0) {
+        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
       }
+      // TODO(TJ): complete padding >=2
     }
+    im2col_common<T>(im, dilation, stride, padding, col);
   }
 };
 
@@ -178,17 +146,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int col_height = col->dims()[0];
     int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
-
     const T* im_data = im.data<T>();
     T* col_data = col->data<T>();
 
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index eecb233d22cea06da016b2671fd606b70eddf5a5..4897767f4d88d9e079f05c921153923c4eb354b0 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int col_height = col->dims()[3];
     int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
     int num_outputs = im_channels * col_height * col_width;
     int blocks = (num_outputs + 1024 - 1) / 1024;
     int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int col_height = col->dims()[0];
     int col_width = col->dims()[1];
 
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
     int block_dim_x = 0;
     int block_dim_y = 0;
     if (filter_height <= 4 && filter_width <= 4) {
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6
--- /dev/null
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -0,0 +1,252 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/**
+ * The most common im2col algorithm.
+ * Support dilation, stride and padding.
+ */
+template <typename T>
+inline void im2col_common(const framework::Tensor& im,
+                          const std::vector<int>& dilation,
+                          const std::vector<int>& stride,
+                          const std::vector<int>& padding,
+                          framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+  int channels_col = im_channels * filter_height * filter_width;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % filter_width;
+    int h_offset = (c / filter_width) % filter_height;
+    int c_im = c / (filter_width * filter_height);
+    for (int h = 0; h < output_height; ++h) {
+      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+      for (int w = 0; w < output_width; ++w) {
+        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+        int col_idx = (c * output_height + h) * output_width + w;
+        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
+        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
+                             im_col_idx < 0 || im_col_idx >= im_width)
+                                ? static_cast<T>(0)
+                                : im_data[im_idx];
+      }
+    }
+  }
+}
+
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 0
+ */
+template <typename T>
+inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
+                                      framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int col_matrix_width = output_width * output_height;
+  int im_size = im_height * im_width;
+  size_t copy_size = sizeof(T) * output_width;
+  const T* im_data_oh = im_data;
+  T* dst_data_oh = col_data;
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* src_data_ic = im_data_oh;
+    T* dst_data = dst_data_oh;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = src_data_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        for (int kw = 0; kw < filter_width; ++kw) {
+          std::memcpy(dst_data, src_data + kw, copy_size);
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+      src_data_ic = src_data_ic + im_size;
+    }
+    im_data_oh = im_data_oh + im_width;
+    dst_data_oh = dst_data_oh + output_width;
+  }
+}
+
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 1
+ * and filter_width == 1 have a special implementation
+ */
+template <typename T>
+inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
+                                      framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+
+  constexpr int plh = 1;
+  constexpr int prh = 1;
+  constexpr int plw = 1;
+  constexpr int prw = 1;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int im_size = im_height * im_width;
+  int col_matrix_width = output_width * output_height;
+  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
+  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
+
+  // fill height padding
+  {
+    size_t copy_size = sizeof(T) * output_width;
+    T* col_start_l = col_data;
+    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
+                     col_matrix_width - output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      T* dst_data_l = col_start_l;
+      T* dst_data_r = col_start_r;
+      for (int kw = 0; kw < filter_width; ++kw) {
+        std::memset(dst_data_l, 0, copy_size);
+        std::memset(dst_data_r, 0, copy_size);
+        dst_data_l = dst_data_l + col_matrix_width;
+        dst_data_r = dst_data_r + col_matrix_width;
+      }
+      col_start_l = col_start_l + col_block_ic;
+      col_start_r = col_start_r + col_block_ic;
+    }
+  }
+
+  auto pad = static_cast<T>(0);
+  if (filter_width == 1) {
+    // fill width padding
+    T* dst_data_ic = col_data;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      T* dst_data_kh = dst_data_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        T* dst_data = dst_data_kh;
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width - 1;
+          *dst_data = pad;
+          ++dst_data;
+        }
+        dst_data_kh = dst_data_kh + col_block_fh;
+      }
+      dst_data_ic = dst_data_ic + col_block_ic;
+    }
+    // fill core
+    size_t copy_size = sizeof(T) * (output_width - plw - prw);
+    for (int oh = 0; oh < output_height; ++oh) {
+      const T* im_data_start =
+          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+      T* dst_data = col_data + oh * output_width;
+      for (int ic = 0; ic < im_channels; ++ic) {
+        const T* src_data = im_data_start + ic * im_size;
+        for (int kh = 0; kh < filter_height; ++kh) {
+          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                         kh > (filter_height - prh - 1))) {
+            dst_data = dst_data + col_matrix_width;
+            continue;
+          }
+          std::memcpy(dst_data + plw, src_data, copy_size);
+          dst_data = dst_data + col_matrix_width;
+          src_data = src_data + im_width;
+        }
+      }
+    }
+    return;
+  }
+
+  // filter_width != 1
+  // fill width padding
+  T* dst_data_ic = col_data;
+  for (int ic = 0; ic < im_channels; ++ic) {
+    T* dst_data_kh = dst_data_ic;
+    for (int kh = 0; kh < filter_height; ++kh) {
+      for (T* dst_data :
+           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
+                             output_width - 1}) {
+        // TODO(TJ): from plh, saving repeated assignment
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width;
+        }
+      }
+      dst_data_kh = dst_data_kh + col_block_fh;
+    }
+    dst_data_ic = dst_data_ic + col_block_ic;
+  }
+
+  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+  // (output_width-1)}
+  // length of copy_size is equal kw.
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+    T* dst_data = col_data + oh * output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = im_data_start + ic * im_size;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                       kh > (filter_height - prh - 1))) {
+          dst_data = dst_data + filter_width * col_matrix_width;
+          continue;
+        }
+        // TODO(TJ): reuse plw-kw outside this for
+        // try to unify
+        for (int kw = 0; kw < plw; ++kw) {
+          std::memcpy(dst_data + (plw - kw), src_data,
+                      sizeof(T) * (output_width - (plw - kw)));
+          dst_data = dst_data + col_matrix_width;
+        }
+        for (int kw = plw; kw < filter_width - prw; ++kw) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * output_width);
+          dst_data = dst_data + col_matrix_width;
+        }
+        int i = 1;
+        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * (output_width - i));
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 8e3f0f286823c383bb0c44d0e7887040ec9b20a0..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
+#include <sys/time.h>
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 template <typename DeviceContext, typename Place>
 void testIm2col() {
@@ -167,3 +169,104 @@ TEST(math, im2col) {
              paddle::platform::CUDAPlace>();
 #endif
 }
+
+#define PREPARE_IM2COL_CPU                                                   \
+  paddle::platform::CPUPlace place;                                          \
+  paddle::platform::CPUDeviceContext context(place);                         \
+  paddle::framework::Tensor input;                                           \
+  paddle::framework::Tensor out;                                             \
+  paddle::framework::Tensor ref;                                             \
+  std::vector<int> padding({ph, pw});                                        \
+  std::vector<int> stride({1, 1});                                           \
+  std::vector<int> dilation({1, 1});                                         \
+  float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place);         \
+  for (int i = 0; i < input.numel(); ++i) {                                  \
+    input_ptr[i] = static_cast<float>(i + 1);                                \
+  }                                                                          \
+  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;            \
+  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;             \
+  out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  paddle::operators::math::Im2ColFunctor<                                    \
+      paddle::operators::math::ColFormat::kCFO,                              \
+      paddle::platform::CPUDeviceContext, float>                             \
+      im2col
+
+void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+
+  im2col(context, input, dilation, stride, padding, &out);
+  paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                padding, &ref);
+
+  float* ref_data = ref.data<float>();
+  float* out_data = out.data<float>();
+  for (int i = 0; i < out.numel(); ++i) {
+    EXPECT_EQ(out_data[i], ref_data[i]);
+  }
+}
+
+void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+  constexpr int repeat = 100;
+  auto GetCurrentMs = []() -> double {
+    struct timeval time;
+    gettimeofday(&time, NULL);
+    return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+  };
+  auto t1 = GetCurrentMs();
+  for (int i = 0; i < repeat; ++i) {
+    im2col(context, input, dilation, stride, padding, &out);
+  }
+  auto t2 = GetCurrentMs();
+
+  for (int i = 0; i < repeat; ++i) {
+    paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                  padding, &ref);
+  }
+  auto t3 = GetCurrentMs();
+
+  LOG(INFO) << "before: " << (t3 - t2) / repeat
+            << ",after: " << (t2 - t1) / repeat
+            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
+}
+
+TEST(math, im2col_cputest) {
+  // padding_h == padding_w
+  for (int p = 0; p < 4; ++p) {
+    // width == height
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
+                  /*pw*/ p);
+
+    // height != width
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+
+    // filter == 1
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+  }
+
+  // padding_h != padding_w
+  testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
+                /*pw*/ 2);
+
+  // benchmark
+  for (int p : {0, 1}) {
+    for (int k : {1, 3, 5}) {
+      LOG(INFO) << "padding == " << p << ", filter == " << k;
+      benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
+                  /*ph*/ p, /*pw*/ p);
+    }
+  }
+}
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
index b9bd49d77d935e985705f78402ffe1ea90f24cb3..895a7019aa10e5d9bb8f0c17e433a4344eac3bf4 100644
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -155,7 +155,7 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
     PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
     auto height = in_dims[0];
     auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), size);
+    PADDLE_ENFORCE_EQ(out->numel(), height);
 
     T* out_buf = out->mutable_data<T>(out->place());
     const T* in_buf = input.data<T>();
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index b545671b43d3a453ab03e4774427179617f62db0..2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) {
   EXPECT_EQ(input3_ptr[6], 86);
   EXPECT_EQ(input3_ptr[7], 99);
 }
+#ifdef PADDLE_WITH_LIBXSMM
+template <typename T>
+void MklSmmCompare(int m, int n, int k) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor mat_b;
+  paddle::framework::Tensor mat_c_smm;
+  paddle::framework::Tensor mat_c_mkl;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+
+  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
+  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
+  T* CSMM = mat_c_smm.mutable_data<T>({m, n}, *cpu_place);
+  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
+  T alpha = static_cast<T>(1);
+  T beta = static_cast<T>(0);
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    A[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < mat_b.numel(); ++i) {
+    B[i] = static_cast<T>(i);
+  }
+  // lda,ldb,ldc follow RowMajor
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+
+  auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
+    const char transa = 'N';
+    const char transb = 'N';
+    paddle::operators::math::CBlas<T>::SMM_GEMM(&transa, &transb, &n, &m, &k,
+                                                &alpha, B, &ldb, A, &lda, &beta,
+                                                CSMM, &ldc);
+  };
 
-TEST(math_function, gemm_trans_clbas) {
+  auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
+    paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
+                                            CblasNoTrans, m, n, k, alpha, A,
+                                            lda, B, ldb, beta, CMKL, ldc);
+  };
+
+  smm();
+  mkl();
+  ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel());
+  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
+    EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]);
+  }
+}
+TEST(math_function, gemm_mkl_vs_smm) {
+  MklSmmCompare<float>(1, 2, 3);
+  MklSmmCompare<double>(1, 2, 3);
+  MklSmmCompare<float>(3, 2, 1);
+  MklSmmCompare<double>(3, 2, 1);
+  MklSmmCompare<float>(3, 8, 5);
+  MklSmmCompare<double>(3, 8, 5);
+}
+#endif
+
+TEST(math_function, gemm_trans_cblas) {
   paddle::framework::Tensor input1;
   paddle::framework::Tensor input2;
   paddle::framework::Tensor input3;
@@ -172,3 +228,57 @@ TEST(math_funciton, set_constant) {
   }
   delete ctx;
 }
+
+template <typename T>
+void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor mat_b;
+  paddle::framework::Tensor mat_c_ref;
+  paddle::framework::Tensor mat_c_mkl;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+
+  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
+  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
+  T* CREF = mat_c_ref.mutable_data<T>({m, n}, *cpu_place);
+  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
+
+  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    A[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < mat_b.numel(); ++i) {
+    B[i] = static_cast<T>(i + 1);
+  }
+  for (int i = 0; i < mat_c_ref.numel(); ++i) {
+    CREF[i] = static_cast<T>(i + 2);
+    CMKL[i] = CREF[i];
+  }
+
+  // this would call gemm_warp
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  GetBlas<T>(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B,
+                           beta, CREF);
+
+  // lda,ldb,ldc follow RowMajor
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+  paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
+                                          CblasNoTrans, m, n, k, alpha, A, lda,
+                                          B, ldb, beta, CMKL, ldc);
+
+  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
+    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
+  }
+}
+
+TEST(math_function, gemm_warp) {
+  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);
+  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
+  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);
+  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
+  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
+  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
+  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
+  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
+}
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e56e297396c6e37867a53f039478191f0caf08e
--- /dev/null
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -0,0 +1,176 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/matrix_bit_code.h"
+#include <iostream>
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
+                                  const framework::Tensor& vec) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t batch_size = tmat->dims()[0];
+  size_t width = tmat->dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+      tmat->data<T>()[i * width + j] += vec.data<T>()[index];
+    }
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
+                                      framework::Tensor* vec) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+      vec->data<T>()[index] += tmat.data<T>()[i * width + j];
+    }
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
+                                  framework::Tensor* sum, T scale_sum) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat.dims()[0];
+  size_t o_width = tmat.dims()[1];
+  for (size_t i = 0; i < num_samples; ++i) {
+    T sm = static_cast<T>(0.0);
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      if (code.calc_bit(j)) {
+        // calc_bit starts from right most bit, while data in tmat[i] is in the
+        // reverse order.
+        sm += tmat.data<T>()[i * o_width + j];
+      }
+    }
+    sum->data<T>()[i] = scale_sum * sm;
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
+                                  const framework::Tensor& weight,
+                                  const framework::Tensor& input) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat->dims()[0];
+  size_t tmat_width = tmat->dims()[1];
+  size_t input_width = input.dims()[1];
+  size_t weight_width = weight.dims()[1];
+  auto tmat_value = tmat->data<T>();
+  auto weight_value = weight.data<T>();
+  auto input_value = input.data<T>();
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+      T sum = static_cast<T>(0.0);
+      for (size_t k = 0; k < input_width; ++k) {
+        sum += weight_value[weight_width * index + k] *
+               input_value[input_width * i + k];
+      }
+      tmat_value[i * tmat_width + j] += sum;
+    }
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
+                                            framework::Tensor* weight,
+                                            const framework::Tensor& input) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat.dims()[0];
+  size_t input_width = input.dims()[1];
+  size_t tmat_width = tmat.dims()[1];
+  size_t weight_width = weight->dims()[1];
+  auto tmat_value = tmat.data<T>();
+  auto weight_value = weight->data<T>();
+  auto input_value = input.data<T>();
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+
+      for (size_t k = 0; k < input_width; ++k) {
+        weight_value[weight_width * index + k] +=
+            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
+      }
+    }
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
+                                           const framework::Tensor& weight,
+                                           framework::Tensor* input) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat.dims()[0];
+  size_t tmat_width = tmat.dims()[1];
+  size_t input_width = input->dims()[1];
+  size_t weight_width = weight.dims()[1];
+  auto tmat_value = tmat.data<T>();
+  auto weight_value = weight.data<T>();
+  auto input_value = input->data<T>();
+
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code.calc_index(j);
+
+      for (size_t k = 0; k < input_width; ++k) {
+        input_value[input_width * i + k] +=
+            tmat_value[i * tmat_width + j] *
+            weight_value[weight_width * index + k];
+      }
+    }
+  }
+}
+
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat->dims()[0];
+  size_t o_width = tmat->dims()[1];
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      if (code.calc_bit(j)) {
+        tmat->data<T>()[i * o_width + j] -= 1;
+      }
+    }
+  }
+}
+
+template class MatrixBitCodeFunctor<float>;
+template class MatrixBitCodeFunctor<double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..5454d58f371afb5f5d6a1c3208318f80d4e0aa36
--- /dev/null
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+/**
+ * SimpleCodeTable class should support 3 functions:
+ *
+ * size_t size()
+ *   return the number of ids
+ *
+ * int get_max_code_length()
+ *   return the maximal code length
+ *
+ * SimpleCode operator()(size_t i)
+ *   return the i-th code. Code class is descriebed below.
+ *
+ * SimpleCode class should support 3 functions:
+ *
+ * int get_length()
+ *   return the length of the code
+ *
+ * size_t cal_index(int bit)
+ *   bit ranges from 0 to get_length() - 1
+ *   return the index for the (1+bit) level parent
+ *
+ * bool calc_bit(int bit)
+ *   return true if the bit level parent is the right child of (1+bit) level
+ *   parent
+ *
+ */
+
+/**
+ * return the 1-based index of the highest bit set
+ *
+ * for x > 0:
+ * \f[
+ *    FindLastSet(x) = 1 + \floor*{\log_{2}x}
+ * \f]
+ */
+inline constexpr size_t FindLastSet(size_t x) {
+  return std::is_same<size_t, unsigned int>::value
+             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
+             : (std::is_same<size_t, unsigned long>::value  // NOLINT
+                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
+                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
+}
+
+struct SimpleCode {
+  SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
+  /**
+   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
+   * is `c + num_classes` and all siblings can get the same weight indice using
+   * prefixes.
+   * Weight index is the prefixes of encoding, thus leave out the right most
+   * bit in calc_index.
+   * Binary classification path is the suffixes of encoding, thus leave out the
+   * left most bit in calc_bit.
+   */
+  inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
+  inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
+  inline int get_length() const { return FindLastSet(c_) - 1; }
+
+ private:
+  size_t c_;
+};
+
+struct SimpleCodeTable {
+  explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {}
+  SimpleCode operator()(size_t code) const {
+    return SimpleCode(code, num_classes_);
+  }
+  size_t size() const { return num_classes_; }
+  int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
+
+ private:
+  size_t num_classes_;
+};
+
+template <typename T>
+class MatrixBitCodeFunctor {
+ public:
+  explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
+      : num_classes_(num_classes), ids_(ids) {}
+  /* For j < code_length
+       tmat(i, j) += vec(0, index(i, j))
+  */
+  void Add(framework::Tensor* tmat, const framework::Tensor& vec);
+
+  /* For j < code_length
+       vec(0, index(i, j)) += tmat(i, j)
+  */
+  void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
+
+  /* For j < code_length
+    sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
+  */
+  void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum);
+
+  /* For j < code_length
+       tmat(i, j) -= bit(i, j)
+  */
+  void Sub(framework::Tensor* tmat);
+  /* For j < code_length
+       input.row(i) += tmat(i, j) * weight.row(index(i, j))
+  */
+  void Mul(framework::Tensor* tmat, const framework::Tensor& weight,
+           const framework::Tensor& input);
+
+  /* For index(i, j) >= 0:
+      weight.row(index(i, j)) += tmat(i, j) * input.row(i)
+  */
+  void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
+                     const framework::Tensor& input);
+  /* For j < code_length
+    input.row(i) += tmat(i, j) * weight.row(index(i, j))
+  */
+  void MulGradError(const framework::Tensor& tmat,
+                    const framework::Tensor& weight, framework::Tensor* input);
+
+  size_t num_classes_;
+  const int64_t* ids_;
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 62e6307ae9f4236a38c49daaf09fc05c54268159..07372235a7c23832e528c3e852a4747f4244b833 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
     auto lods = lod_tensor.lod();
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
-    auto lod = lods[0];
+    const auto& lod = lods[0];
 
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index a579182ec1bd5d10d95bbf8c6f5a0e70ceaaaf4b..3effe776258cb541dbba32f63eda457d917011f4 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
       X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
@@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
       Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index a16861b3b77fc980ab932b9d88859b38ec36108b..2dc1467b0d4816d5cc0535eb62e936cf342a241c 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
     auto level = static_cast<size_t>(Attr<int>("level"));
 
-    auto &mask_dim = mask.dims();
+    PADDLE_ENFORCE(in_true.numel() || in_false.numel(),
+                   "Input(InTrue) or Input(InFalse) should be initialized.");
 
+    auto &mask_dim = mask.dims();
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
@@ -59,19 +61,27 @@ class MergeLoDTensorOp : public framework::OperatorBase {
     }
     auto *mask_data = cpu_mask->data<bool>();
 
-    int rank = in_true.dims().size();
-    platform::Place place = in_true.place();
-    std::type_index data_type = in_true.type();
-    framework::DDim in_true_dims =
-        framework::slice_ddim(in_true.dims(), 1, rank);
-
+    platform::Place place = dev_place;
     int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
 
-    auto in_true_dim_vec = framework::vectorize(in_true_dims);
-    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+    std::type_index data_type =
+        in_true.IsInitialized() ? in_true.type() : in_false.type();
+    int rank;
+    framework::DDim in_dims;
+    if (in_true.IsInitialized()) {
+      rank = in_true.dims().size();
+      in_dims = framework::slice_ddim(in_true.dims(), 1, rank);
+    } else {
+      rank = in_false.dims().size();
+      in_dims = framework::slice_ddim(in_false.dims(), 1, rank);
+    }
+
+    auto in_dim_vec = framework::vectorize(in_dims);
+    in_dim_vec.insert(in_dim_vec.begin(), batch_size);
 
-    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    framework::DDim out_dims = framework::make_ddim(in_dim_vec);
     out->Resize(out_dims);
+
     out->mutable_data(place, data_type);
 
     auto *out_lod = out->mutable_lod();
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -98,7 +98,7 @@ The update equations are as follows:
 $$
 velocity = mu * velocity + gradient \\
 if (use\_nesterov):   \\
-  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+  param = param - (gradient + mu * velocity) * learning\_rate \\
 else:   \\
   param = param - learning\_rate * velocity. \\
 $$
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
       T g_val = g[i];
       T v_new = v[i] * mu + g_val;
       v_out[i] = v_new;
-      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+      p_out[i] = p[i] - (g_val + v_new * mu) * lr;
     }
   } else {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
 
     v_out = v * mu + g;
     if (use_nesterov) {
-      p_out = p - (g - v_out * mu) * lr[0];
+      p_out = p - (g + v_out * mu) * lr[0];
     } else {
       p_out = p - lr[0] * v_out;
     }
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index c9744db3d0654ef63357963d9a9a3cb946f56e2d..eb09470f37eabb5524f774bc289fc68f5884c540 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -164,14 +163,11 @@ class ParallelDoOp : public framework::OperatorBase {
       auto &place = places[place_idx];
       auto *cur_scope = sub_scopes[place_idx];
 
-      workers.emplace_back(
-          framework::Async([program, cur_scope, place, block, place_idx] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
-            framework::Executor executor(place);
-            executor.Run(*program, cur_scope, block->ID(),
-                         false /*create_local_scope*/);
-          }));
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
     }
     for (auto &worker : workers) {
       worker.wait();
@@ -242,14 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase {
       auto *cur_scope = sub_scopes[i];
 
       // execute
-      workers.emplace_back(
-          framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
-            framework::Executor executor(place);
-            executor.Run(*program, cur_scope, block->ID(),
-                         false /*create_local_scope*/);
-          }));
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
     }
     for (auto &worker : workers) {
       worker.wait();
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index be55bc43b14f1e6211f71b4080d1676838ad508c..31f083565fddee66aea1485ed71f41b6199f4502 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
+    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         cudnn_output_desc, output_data));
   }
@@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
 
-      PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
+      CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
           handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
           cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
           &beta, cudnn_input_desc, input_grad_data));
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 8734282fe496b8e90af19abd5549566d62316fc3..4b804740a06f9e29704f2b3f58a90191e3559347 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
 
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 65fcce8bb019965a805ad09d50be0aba64e4f24e..a0d640b2020958af53a4405ae886eadb2a1e117e 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
             .GetMutable<framework::ReaderHolder>();
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
+
+    // For profiling
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(dev_place);
+    platform::RecordEvent record_event(Type(), &ctx);
+
     reader->ReadNext(&ins);
     if (ins.empty()) {
       if (Attr<bool>("throw_eof_exp")) {
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index a39c8a00538875e4e3284898230a6cb0693b7a12..728197377df04df8c993a48bc282431473fe9959 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -15,14 +15,14 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
-reader_library(open_files_op SRCS open_files_op.cc)
+cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
 reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
-reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
+reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
-reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
 
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index db8cf3b605c9175eeda4548b1e7c8203f26c5d89..28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -81,6 +81,15 @@ class BlockingQueue {
     }
   }
 
+  void ReOpen() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    closed_ = false;
+    std::deque<T> new_deque;
+    queue_.swap(new_deque);
+    send_cv_.notify_all();
+    receive_cv_.notify_all();
+  }
+
   void Close() {
     std::lock_guard<std::mutex> lock(mutex_);
     closed_ = true;
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26ff221dfa0768bd2bcc9e6485a32485f0212ac6
--- /dev/null
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace reader {
+BufferedReader::~BufferedReader() {
+  reader_->Shutdown();
+  while (!position_.empty()) {
+    position_.front().wait();
+    position_.pop();
+  }
+}
+
+BufferedReader::BufferedReader(
+    const std::shared_ptr<framework::ReaderBase> &reader,
+    const platform::Place &place, size_t buffer_size)
+    : framework::DecoratedReader(reader),
+      thread_pool_(1),
+      place_(place),
+      buffer_size_(buffer_size) {
+  cpu_buffer_.resize(buffer_size);
+  gpu_buffer_.resize(buffer_size);
+  ReadTillBufferFullAsync();
+}
+
+void BufferedReader::ReadTillBufferFullAsync() {
+  PADDLE_ENFORCE_EQ(position_.size(), 0U);
+  for (size_t i = 0; i < buffer_size_; ++i) {
+    ReadAsync(i);
+  }
+}
+
+void BufferedReader::ReadAsync(size_t i) {
+  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
+    TensorVec &cpu = cpu_buffer_[i];
+    reader_->ReadNext(&cpu);
+
+    if (cpu.empty()) {
+      return -1UL;
+    }
+
+    if (platform::is_gpu_place(place_)) {
+      TensorVec &gpu = gpu_buffer_[i];
+      gpu.resize(cpu.size());
+      for (size_t i = 0; i < cpu.size(); ++i) {
+        framework::TensorCopySync(cpu[i], place_, &gpu[i]);
+        gpu[i].set_lod(cpu[i].lod());
+      }
+    }
+    return i;
+  }));
+}
+
+void BufferedReader::ShutdownImpl() {
+  reader_->Shutdown();
+  while (!position_.empty()) {
+    position_.pop();
+  }
+  prev_pos_ = -1UL;
+}
+
+void BufferedReader::StartImpl() {
+  reader_->Start();
+  ReadTillBufferFullAsync();
+}
+
+void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
+  if (position_.empty()) {
+    out->clear();
+    return;
+  }
+  size_t i = position_.front().get();
+  position_.pop();
+
+  if (i == -1UL) {
+    ReadNextImpl(out);
+    return;
+  }
+
+  *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i];
+
+  // Do not push current position into ReadAsync. Push the previous position
+  // Since all computation in fluid are async, change the data of
+  // current position may cause data error.
+  if (prev_pos_ != -1Ul) {
+    ReadAsync(prev_pos_);
+  }
+  prev_pos_ = i;
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbe2bc1b5fdd69d1a843b768e3289acd621369a6
--- /dev/null
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <list>
+#include <queue>
+#include <vector>
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class BufferedReader : public framework::DecoratedReader {
+  using TensorVec = std::vector<framework::LoDTensor>;
+  using VecFuture = std::future<TensorVec>;
+
+ public:
+  BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
+                 const platform::Place& place, size_t buffer_size);
+
+  ~BufferedReader() override;
+
+ private:
+  void ReadTillBufferFullAsync();
+
+  void ReadAsync(size_t i);
+
+ protected:
+  void ShutdownImpl() override;
+  void StartImpl() override;
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  ThreadPool thread_pool_;
+  platform::Place place_;
+  const size_t buffer_size_;
+
+  std::queue<std::future<size_t>> position_;
+
+  // The buffer for reading data.
+  // NOTE: the simplest way to implement buffered reader is do not use any
+  // buffer, just read async and create futures as buffer size. However, to
+  // malloc tensors every time is extremely slow. Here we store all data in
+  // buffers and prevent alloc every time.
+  std::vector<TensorVec> cpu_buffer_;
+  std::vector<TensorVec> gpu_buffer_;
+  size_t prev_pos_{-1UL};
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index ecbae3894d551186f53625a6cc9cfdb36adc8d2d..e17c2ffd39eea31fe85933eda144ab97cf8c3dd8 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,15 +20,19 @@ namespace reader {
 
 class BatchReader : public framework::DecoratedReader {
  public:
-  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
-      : DecoratedReader(reader), batch_size_(batch_size) {
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size,
+              bool discard_leftover)
+      : DecoratedReader(reader),
+        batch_size_(static_cast<size_t>(batch_size)),
+        discard_leftover_(discard_leftover) {
     buffer_.reserve(batch_size_);
   }
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
 
  private:
-  int batch_size_;
+  size_t batch_size_;
+  bool discard_leftover_;
   std::vector<std::vector<framework::LoDTensor>> buffer_;
 };
 
@@ -46,8 +50,9 @@ class CreateBatchReaderOp : public framework::OperatorBase {
     }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    out->Reset(
-        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
+    out->Reset(framework::MakeDecoratedReader<BatchReader>(
+        underlying_reader, Attr<int>("batch_size"),
+        Attr<bool>("discard_leftover")));
   }
 };
 
@@ -57,6 +62,10 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
     AddAttr<int>("batch_size",
                  "How many instances the batch reader yields each time.")
         .GreaterThan(0);
+    AddAttr<bool>("discard_leftover",
+                  "If true, the leftover instances that are not enough for a "
+                  "new batch will be discarded.")
+        .SetDefault(true);
     AddComment(R"DOC(
       CreateBatchReader Operator
 
@@ -66,10 +75,10 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
   }
 };
 
-void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
   buffer_.clear();
   buffer_.reserve(batch_size_);
-  for (int i = 0; i < batch_size_; ++i) {
+  for (size_t i = 0; i < batch_size_; ++i) {
     buffer_.push_back(std::vector<framework::LoDTensor>());
     reader_->ReadNext(&buffer_.back());
     if (buffer_.back().empty()) {
@@ -77,15 +86,18 @@ void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
       break;
     }
   }
+  if (discard_leftover_ && buffer_.size() < batch_size_) {
+    buffer_.clear();
+  }
   // Concat instances
   out->clear();
   if (buffer_.empty()) {
     // if buffer_ is empty, the 'out' will return as an empty vector.
     return;
   }
-  int out_num = buffer_[0].size();
+  size_t out_num = buffer_[0].size();
   out->reserve(out_num);
-  for (int j = 0; j < out_num; ++j) {
+  for (size_t j = 0; j < out_num; ++j) {
     // Merge shape and check date type
     std::type_index batch_type = buffer_[0][j].type();
     framework::DDim batch_shape = buffer_[0][j].dims();
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
index a75c6d4c567ac93f37b38070421133af305f20a3..85394b336fc967fc6973131fbedda4c796825185 100644
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -33,7 +33,7 @@ class CustomReader : public framework::DecoratedReader {
         source_var_names_(source_var_names),
         sink_var_names_(sink_var_names) {}
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
 
  private:
   const framework::ProgramDesc program_;
@@ -60,10 +60,10 @@ class CreateCustomReaderOp : public framework::OperatorBase {
     }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    out->Reset(
-        new CustomReader(underlying_reader.Get(), *sub_block,
-                         Attr<std::vector<std::string>>("source_var_names"),
-                         Attr<std::vector<std::string>>("sink_var_names")));
+    out->Reset(framework::MakeDecoratedReader<CustomReader>(
+        underlying_reader, *sub_block,
+        Attr<std::vector<std::string>>("source_var_names"),
+        Attr<std::vector<std::string>>("sink_var_names")));
   }
 };
 
@@ -143,7 +143,7 @@ class CustomReaderInferVarType : public framework::VarTypeInference {
   }
 };
 
-void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+void CustomReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
   out->clear();
   std::vector<framework::LoDTensor> underlying_outs;
   reader_->ReadNext(&underlying_outs);
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 5f734489a81764875988f440696682570ff4d1d7..ed719f91d0980480aa62a5cd3c1f819e6c0e7475 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -12,74 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <thread>  // NOLINT
-
-#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
-
-// 'Double buffer' means we shall maintain two batches of input data at the same
-// time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 5;
-// There will be two bacthes out of the channel during training:
-// 1. the one waiting to be sent to the channel
-// 2. the one just be received from the channel, which is also being used by
-// subsequent operators.
-// So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 3;  // kCacheSize - 2
-
-class DoubleBufferReader : public framework::DecoratedReader {
- public:
-  explicit DoubleBufferReader(
-      const std::shared_ptr<ReaderBase>& reader,
-      platform::Place target_place = platform::CPUPlace())
-      : DecoratedReader(reader), place_(target_place) {
-    cpu_tensor_cache_.resize(kCacheSize);
-    gpu_tensor_cache_.resize(kCacheSize);
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(place_)) {
-      for (size_t i = 0; i < kCacheSize; ++i) {
-        ctxs_.emplace_back(new platform::CUDADeviceContext(
-            boost::get<platform::CUDAPlace>(place_)));
-      }
-    }
-#endif
-    StartPrefetcher();
-  }
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  void ReInit() override;
-
-  ~DoubleBufferReader() { EndPrefetcher(); }
-
- private:
-  void StartPrefetcher() {
-    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
-    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
-  }
-
-  void EndPrefetcher() {
-    channel_->Close();
-    if (prefetcher_.joinable()) {
-      prefetcher_.join();
-    }
-    delete channel_;
-    channel_ = nullptr;
-  }
-
-  void PrefetchThreadFunc();
-
-  std::thread prefetcher_;
-  reader::BlockingQueue<size_t>* channel_;
-  platform::Place place_;
-  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
-  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
-  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
-};
-
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
@@ -109,7 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
       place = platform::CUDAPlace(static_cast<int>(num));
     }
 
-    out->Reset(new DoubleBufferReader(underlying_reader.Get(), place));
+    out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
+                                                              place, 2));
   }
 };
 
@@ -136,57 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
   }
 };
 
-void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  size_t cached_tensor_id;
-  if (channel_->Receive(&cached_tensor_id)) {
-    if (platform::is_gpu_place(place_)) {
-      *out = gpu_tensor_cache_[cached_tensor_id];
-    } else {
-      // CPU place
-      *out = cpu_tensor_cache_[cached_tensor_id];
-    }
-  } else {
-    out->clear();
-  }
-}
-
-void DoubleBufferReader::ReInit() {
-  reader_->ReInit();
-  EndPrefetcher();
-  StartPrefetcher();
-}
-
-void DoubleBufferReader::PrefetchThreadFunc() {
-  VLOG(5) << "A new prefetch thread starts.";
-  size_t cached_tensor_id = 0;
-  while (true) {
-    auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
-    reader_->ReadNext(&cpu_batch);
-    if (cpu_batch.empty()) {
-      // The underlying reader have no next data.
-      break;
-    }
-    if (platform::is_gpu_place(place_)) {
-      auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
-      gpu_batch.resize(cpu_batch.size());
-      for (size_t i = 0; i < cpu_batch.size(); ++i) {
-        // TODO(fengjiayi): Use asynchronous TensorCopy instead
-        framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
-        gpu_batch[i].set_lod(cpu_batch[i].lod());
-      }
-    }
-    if (!channel_->Send(cached_tensor_id)) {
-      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread will terminate.";
-      break;
-    }
-    ++cached_tensor_id;
-    cached_tensor_id %= kCacheSize;
-  }
-  channel_->Close();
-  VLOG(5) << "Prefetch thread terminates.";
-}
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index 19b54110b9aeece33b8d6c73612ae0e12dbfafbd..0a225597d34f43c7fb82aeae2552cdf16c8ba566 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -24,23 +24,22 @@ class MultiPassReader : public framework::DecoratedReader {
   MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     reader_->ReadNext(out);
-    if (out->empty()) {
+    if (out->empty() && pass_count_ < pass_num_ - 1) {
+      reader_->Shutdown();
+      reader_->Start();
+      reader_->ReadNext(out);
       ++pass_count_;
-      if (pass_count_ < pass_num_) {
-        reader_->ReInit();
-        reader_->ReadNext(out);
-      }
     }
   }
 
-  void ReInit() override {
+ private:
+  void StartImpl() override {
     pass_count_ = 0;
-    reader_->ReInit();
+    reader_->Start();
   }
 
- private:
   int pass_num_;
   mutable int pass_count_;
 };
@@ -60,7 +59,8 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     int pass_num = Attr<int>("pass_num");
-    out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
+    out->Reset(framework::MakeDecoratedReader<MultiPassReader>(
+        underlying_reader, pass_num));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 36587360f7347a10e01d4e994482027d9a9bb5d0..0f31ca1a94326956ae5e6dffd582daedeb55a9e3 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -19,9 +19,10 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-class PyReader : public framework::ReaderBase {
+class PyReader : public framework::FileReader {
  public:
-  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue) {
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
+      : framework::FileReader() {
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
     queue_ = queue;
   }
@@ -32,7 +33,11 @@ class PyReader : public framework::ReaderBase {
     if (!success) out->clear();
   }
 
-  void ReInit() override {}
+  ~PyReader() { queue_->Close(); }
+
+  void Shutdown() override { queue_->Close(); }
+
+  void Start() override { queue_->ReOpen(); }
 
  private:
   std::shared_ptr<LoDTensorBlockingQueue> queue_;
@@ -51,14 +56,14 @@ class CreatePyReaderOp : public framework::OperatorBase {
 
     const std::string& queue_name = Input("blocking_queue");
     auto* queue_holder_var = scope.FindVar(queue_name);
-    PADDLE_ENFORCE(
-        queue_holder_var != nullptr,
+    PADDLE_ENFORCE_NOT_NULL(
+        queue_holder_var,
         "No LoDTensorBlockingQueueHolder variable with name %s found",
         queue_name);
     auto* queue_holder =
         queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
 
-    out->Reset(new PyReader(queue_holder->GetQueue()));
+    out->Reset(std::make_shared<PyReader>(queue_holder->GetQueue()));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index 5b7e8a063a034f0be056065826fca0fe807bc9a7..e5c116dfcd71ef40597ca19d1da0b51038baaad1 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -19,11 +19,11 @@ namespace operators {
 namespace reader {
 
 template <typename T>
-class RandomDataGenerator : public framework::ReaderBase {
+class RandomDataGenerator : public framework::FileReader {
  public:
   RandomDataGenerator(const std::vector<framework::DDim>& shapes, float low,
                       float high)
-      : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) {
+      : framework::FileReader(), low_(low), high_(high), shapes_(shapes) {
     PADDLE_ENFORCE_LE(low, high,
                       "'low' shouldn't be greater than 'high'.(%f vs %f)", low,
                       high);
@@ -32,7 +32,7 @@ class RandomDataGenerator : public framework::ReaderBase {
     dist_ = std::uniform_real_distribution<float>(low_, high_);
   }
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     out->clear();
     out->reserve(shapes_.size());
     for (const framework::DDim& shape : shapes_) {
@@ -51,8 +51,6 @@ class RandomDataGenerator : public framework::ReaderBase {
     }
   }
 
-  void ReInit() override { return; }
-
  private:
   float low_;
   float high_;
@@ -79,8 +77,8 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("low"),
-                                          Attr<float>("high")));
+    out->Reset(std::make_shared<RandomDataGenerator<T>>(
+        shapes, Attr<float>("low"), Attr<float>("high")));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index 559827f08494af6730aafa1e67c46a47c21dedf6..a08a9dbd0da46e73082cdd24c019e8d210d8bcc4 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -21,10 +21,8 @@ namespace reader {
 template <bool ThreadSafe>
 class RecordIOFileReader : public framework::FileReader {
  public:
-  explicit RecordIOFileReader(const std::string& filename,
-                              const std::vector<framework::DDim>& dims)
-      : FileReader(dims),
-        scanner_(filename),
+  explicit RecordIOFileReader(const std::string& filename)
+      : scanner_(filename),
         dev_ctx_(*platform::DeviceContextPool::Instance().Get(
             platform::CPUPlace())) {
     if (ThreadSafe) {
@@ -33,18 +31,21 @@ class RecordIOFileReader : public framework::FileReader {
     LOG(INFO) << "Creating file reader" << filename;
   }
 
-  void ReInit() override { scanner_.Reset(); }
-
  protected:
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    std::unique_ptr<std::lock_guard<std::mutex>> guard;
     if (ThreadSafe) {
-      std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
-    } else {
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
+      guard.reset(new std::lock_guard<std::mutex>(*mutex_));
+    }
+
+    bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
+    if (!ok) {
+      out->clear();
     }
   }
 
+  void StartImpl() override { scanner_.Reset(); }
+
  private:
   std::unique_ptr<std::mutex> mutex_;
   recordio::Scanner scanner_;
@@ -58,20 +59,11 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
-    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
-    const auto& ranks = Attr<std::vector<int>>("ranks");
-    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
-    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      static_cast<int>(shape_concat.size()),
-                      "The accumulate of all ranks should be equal to the "
-                      "shape concat's length.");
     std::string filename = Attr<std::string>("filename");
-
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
 
-    out->Reset(new RecordIOFileReader<true>(
-        filename, RestoreShapes(shape_concat, ranks)));
+    out->Reset(std::make_shared<RecordIOFileReader<true>>(filename));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 57e8e21214b7c99e52550fe51a67c9b5201cb46f..3f72890a7cee1453585d50afa04fa62a9b059dc3 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -34,7 +34,7 @@ class ShuffleReader : public framework::DecoratedReader {
     ReloadBuffer();
   }
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     out->clear();
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
@@ -47,6 +47,17 @@ class ShuffleReader : public framework::DecoratedReader {
   }
 
  private:
+  void ShutdownImpl() override {
+    reader_->Shutdown();
+    buffer_.clear();
+    iteration_pos_ = 0;
+  }
+
+  void StartImpl() override {
+    reader_->Start();
+    ReloadBuffer();
+  }
+
   void ReloadBuffer() {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
@@ -86,9 +97,8 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
     }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    out->Reset(
-        new ShuffleReader(underlying_reader.Get(),
-                          static_cast<size_t>(Attr<int>("buffer_size"))));
+    out->Reset(framework::MakeDecoratedReader<ShuffleReader>(
+        underlying_reader, static_cast<size_t>(Attr<int>("buffer_size"))));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
deleted file mode 100644
index 3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/reader/reader_op_registry.h"
-
-namespace paddle {
-namespace operators {
-namespace reader {
-
-class ThreadedReader : public framework::DecoratedReader {
- public:
-  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
-      : DecoratedReader(reader) {}
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    std::lock_guard<std::mutex> lock(mutex_);
-    reader_->ReadNext(out);
-  }
-
-  void ReInit() override { reader_->ReInit(); }
-
- private:
-  std::mutex mutex_;
-};
-
-class CreateThreadedReaderOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto* out = detail::Ref(scope.FindVar(Output("Out")))
-                    .GetMutable<framework::ReaderHolder>();
-    if (out->Get() != nullptr) {
-      return;
-    }
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
-    out->Reset(new ThreadedReader(underlying_reader.Get()));
-  }
-};
-
-class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
- protected:
-  void Apply() override {
-    AddComment(R"DOC(
-      CreateThreadedReader Operator
-
-      This operator creates a threaded reader. A threaded reader's
-      'ReadNext()' can be invoked by several threads at the same
-      time.
-      When the attribute 'safe_mode' is true, the threaded reader's
-      'ReInit()' is disabled to avoid unexpected bugs in multi-thread
-      environment.
-    )DOC");
-  }
-};
-
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
-
-namespace reader = paddle::operators::reader;
-REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
-                                   reader::CreateThreadedReaderOp,
-                                   reader::CreateThreadedReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
index 30d962ba10a954a837f9771d21cedf0feb643439..4f7cfc24ec035349f3c85e84d876ad9b5b5493a6 100644
--- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -38,12 +38,10 @@ class LoDTensorBlockingQueue {
 
  public:
   bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
-    CheckDims(lod_tensor_vec);
     return queue_.Send(lod_tensor_vec);
   }
 
   bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
-    CheckDims(lod_tensor_vec);
     return queue_.Send(std::move(lod_tensor_vec));
   }
 
@@ -58,25 +56,13 @@ class LoDTensorBlockingQueue {
 
   inline size_t Size() const { return queue_.Size(); }
 
-  inline void Close() { return queue_.Close(); }
+  inline void ReOpen() { queue_.ReOpen(); }
+
+  inline void Close() { queue_.Close(); }
 
   inline bool IsClosed() const { return queue_.IsClosed(); }
 
  private:
-  void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
-    PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
-                   "Expect input size is %d but found %s", dims_.size(),
-                   lod_tensor_vec.size());
-    for (size_t i = 0; i < dims_.size(); ++i) {
-      const auto& in_dims = framework::slice_ddim(
-          lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
-      const auto& expect_dims =
-          framework::slice_ddim(dims_[i], 1, dims_[i].size());
-      PADDLE_ENFORCE(in_dims == expect_dims,
-                     "Dims of the %d-th input tensor do not match", i);
-    }
-  }
-
   BlockingQueue<std::vector<framework::LoDTensor>> queue_;
   std::vector<framework::DDim> dims_;
 };
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index 31e5d81e55ed9703eb3a9ef2595fa2a280f1a734..38223e069975a08791d58d6ae10e2112b79a61fe 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -12,152 +12,200 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cmath>
+#include <stdexcept>
 #include <thread>  // NOLINT
-
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
 
-class MultiFileReader : public framework::ReaderBase {
+class IReaderContainer {
  public:
-  MultiFileReader(const std::vector<std::string>& file_names,
-                  const std::vector<framework::DDim>& dims, size_t thread_num,
-                  size_t buffer_size)
-      : buffer_size_(buffer_size) {
-    readers_.reserve(file_names.size());
-    for (const std::string& f_name : file_names) {
-      readers_.emplace_back(CreateReaderByFileName(f_name, dims));
+  virtual ~IReaderContainer() {}
+  virtual void AppendReader(
+      std::unique_ptr<framework::ReaderBase>&& readers) = 0;
+  virtual void Stop() = 0;
+  virtual void Start() = 0;
+  virtual void ReadNext(std::vector<framework::LoDTensor>* out) = 0;
+};
+
+class OrderedReaderContainer : public IReaderContainer {
+ public:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace(std::move(reader));
+  }
+
+  void Stop() override {
+    while (!pending_.empty()) {
+      MoveFrontPendingToDone();
     }
-    prefetchers_.resize(thread_num);
-    StartNewScheduler();
   }
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  void ReInit() override;
+  void Start() override { std::swap(done_, pending_); }
 
-  ~MultiFileReader() { EndScheduler(); }
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      pending_.front()->ReadNext(out);
+      if (out->empty()) {
+        MoveFrontPendingToDone();
+        ReadNext(out);
+      }
+    } else {
+      out->clear();
+    }
+  }
 
  private:
-  void StartNewScheduler();
-  void EndScheduler();
-  void ScheduleThreadFunc();
-  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
-
-  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
-  std::thread scheduler_;
-  std::vector<std::thread> prefetchers_;
-  size_t buffer_size_;
-  reader::BlockingQueue<size_t>* waiting_reader_idx_;
-  reader::BlockingQueue<size_t>* available_thread_idx_;
-  reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
+  void MoveFrontPendingToDone() {
+    pending_.front()->Shutdown();
+    pending_.front()->Start();
+    done_.emplace(move(pending_.front()));
+    pending_.pop();
+  }
+
+  std::queue<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::queue<std::unique_ptr<framework::ReaderBase>> done_;
 };
 
-void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!buffer_->Receive(out)) {
-    out->clear();
-  }
-}
-
-void MultiFileReader::ReInit() {
-  EndScheduler();
-  StartNewScheduler();
-}
-
-void MultiFileReader::StartNewScheduler() {
-  size_t thread_num = prefetchers_.size();
-  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
-  available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
-  buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
-      buffer_size_);
-
-  for (size_t i = 0; i < readers_.size(); ++i) {
-    waiting_reader_idx_->Send(i);
-  }
-  waiting_reader_idx_->Close();
-  for (size_t i = 0; i < thread_num; ++i) {
-    available_thread_idx_->Send(i);
-  }
+class PreemptiveReaderContainer : public IReaderContainer {
+  using ReaderList = std::list<std::unique_ptr<framework::ReaderBase>>;
 
-  scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
-}
+  struct FutureItem {
+    std::vector<framework::LoDTensor> data_;
+    ReaderList::iterator reader_it_;
+    std::exception_ptr exception_;
+  };
 
-void MultiFileReader::EndScheduler() {
-  available_thread_idx_->Close();
-  buffer_->Close();
-  waiting_reader_idx_->Close();
-  if (scheduler_.joinable()) {
-    scheduler_.join();
-  }
-  delete buffer_;
-  delete available_thread_idx_;
-  delete waiting_reader_idx_;
-}
-
-void MultiFileReader::ScheduleThreadFunc() {
-  VLOG(5) << "MultiFileReader schedule thread starts.";
-  size_t completed_thread_num = 0;
-  size_t thread_idx;
-  while (available_thread_idx_->Receive(&thread_idx)) {
-    std::thread& prefetcher = prefetchers_[thread_idx];
-    if (prefetcher.joinable()) {
-      prefetcher.join();
-    }
-    size_t reader_idx;
-    if (waiting_reader_idx_->Receive(&reader_idx)) {
-      // Still have files to read. Start a new prefetch thread.
-      prefetcher = std::thread([this, reader_idx, thread_idx] {
-        PrefetchThreadFunc(reader_idx, thread_idx);
-      });
-    } else {
-      // No more file to read.
-      ++completed_thread_num;
-      if (completed_thread_num == prefetchers_.size()) {
-        buffer_->Close();
-        break;
+  using FutureList = std::list<std::future<FutureItem>>;
+
+ public:
+  explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {}
+
+  void Stop() override {
+    if (!pending_.empty()) {
+      for (auto& reader : pending_) {
+        reader->Shutdown();
+      }
+      for (auto& fu : futures_) {
+        fu.wait();
       }
+      futures_.clear();
+      for (auto& reader : pending_) {
+        reader->Start();
+        done_.emplace_back(std::move(reader));
+      }
+      pending_.clear();
+      bool timeout;
+      complete_queue_.PopAll(1000, &timeout);
+      PADDLE_ENFORCE(!timeout);
     }
   }
-  // If users invoke ReInit() when scheduler is running, it will close the
-  // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
-  // to release their resource. So a check is needed before scheduler ends.
-  for (auto& p : prefetchers_) {
-    if (p.joinable()) {
-      p.join();
+
+  void Start() override {
+    for (auto& reader : done_) {
+      AppendReader(std::move(reader));
     }
+    done_.clear();
   }
-  VLOG(5) << "MultiFileReader schedule thread terminates.";
-}
-
-void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
-  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
-  while (true) {
-    std::vector<framework::LoDTensor> ins;
-    reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->ReInit();
-      break;
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      auto future_it = complete_queue_.Pop();
+      FutureItem item = future_it->get();
+      if (item.exception_) {
+        for (auto it = futures_.begin(); it != futures_.end(); ++it) {
+          if (it != future_it) {
+            it->wait();  // Wait all other threads complete.
+          }
+        }
+        std::rethrow_exception(item.exception_);
+
+      } else if (item.data_.empty()) {  // reader done.
+        done_.emplace_back(std::move(*item.reader_it_));
+        pending_.erase(item.reader_it_);
+        futures_.erase(future_it);
+        ReadNext(out);
+      } else {
+        *out = item.data_;
+        // continue read async
+        ReadAsync(item.reader_it_, &future_it);
+      }
+    } else {
+      out->clear();
     }
-    try {
-      buffer_->Send(std::move(ins));
-    } catch (paddle::platform::EnforceNotMet e) {
-      VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
-                 "thread of file idx '"
-              << reader_idx << "' will terminate.";
-      break;
+  }
+
+ private:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace_back(std::move(reader));
+    auto reader_it = pending_.end();
+    --reader_it;
+
+    futures_.emplace_back();
+    auto future_it = futures_.end();
+    --future_it;
+
+    ReadAsync(reader_it, &future_it);
+  }
+
+  void ReadAsync(const ReaderList::iterator& reader_it,
+                 FutureList::iterator* future_it_ptr) {
+    auto& future_it = *future_it_ptr;
+    *future_it = pool_.enqueue([reader_it, future_it, this] {
+      try {
+        FutureItem item;
+        item.reader_it_ = reader_it;
+        (*reader_it)->ReadNext(&item.data_);
+        if (item.data_.empty()) {
+          (*reader_it)->Shutdown();
+          (*reader_it)->Start();
+        }
+        complete_queue_.Push(future_it);
+        return item;
+      } catch (...) {
+        FutureItem item;
+        item.exception_ = std::current_exception();
+        complete_queue_.Push(future_it);
+        return item;
+      }
+    });
+  }
+
+  FutureList futures_;
+  ThreadPool pool_;
+  framework::BlockingQueue<FutureList::iterator> complete_queue_;
+  std::list<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::list<std::unique_ptr<framework::ReaderBase>> done_;
+};
+
+class MultiFileReader : public framework::ReaderBase {
+ public:
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  std::unique_ptr<IReaderContainer>&& container)
+      : container_(std::move(container)) {
+    for (auto& fn : file_names) {
+      container_->AppendReader(CreateReaderByFileName(fn));
     }
   }
 
-  if (!available_thread_idx_->Send(thread_idx)) {
-    VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
-               "Fail to send thread_idx.";
+  ~MultiFileReader() { container_->Stop(); }
+
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    container_->ReadNext(out);
   }
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
-          << "' terminates.";
-}
+  void ShutdownImpl() override { container_->Stop(); }
+  void StartImpl() override { container_->Start(); }
+
+ private:
+  std::unique_ptr<IReaderContainer> container_;
+};
 
 class OpenFilesOp : public framework::OperatorBase {
  public:
@@ -175,14 +223,27 @@ class OpenFilesOp : public framework::OperatorBase {
                       "shape concat's length.");
     const auto& file_names = Attr<std::vector<std::string>>("file_names");
     PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
-    const size_t thread_num = Attr<int>("thread_num");
-    const size_t buffer_size = Attr<int>("buffer_size");
+    bool is_test = Attr<bool>("is_test");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new MultiFileReader(file_names,
-                                   RestoreShapes(shape_concat, ranks),
-                                   thread_num, buffer_size));
+    std::unique_ptr<IReaderContainer> container;
+
+    if (is_test) {
+      container.reset(new OrderedReaderContainer());
+    } else {
+      container.reset(new PreemptiveReaderContainer(
+          static_cast<size_t>(Attr<int>("thread_num"))));
+    }
+
+    std::shared_ptr<framework::ReaderBase> reader(
+        new MultiFileReader(file_names, std::move(container)));
+    auto buffer_size = Attr<int>("buffer_size");
+    if (buffer_size > 1) {
+      reader = framework::MakeDecoratedReader<BufferedReader>(
+          reader, platform::CPUPlace(), buffer_size);
+    }
+    out->Reset(reader);
   }
 };
 
@@ -190,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
     AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
-    AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
-        .GreaterThan(0);
-    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
+    AddAttr<bool>("is_test", "Used for testing data.").SetDefault(false);
 
     AddComment(R"DOC(
       OpenFiles Operator
@@ -200,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
       An OpenFilesOp creates a MultiFileReader, which is able to
       read data multi-threaded from multiple files.
     )DOC");
+    AddAttr<int>("thread_num",
+                 "The maximal concurrent prefetch thread number. Used only "
+                 "when is_test = False");
+    AddAttr<int>("buffer_size", "The reading buffer of these files.")
+        .GreaterThan(0);
   }
 };
 
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index e11256a49ffa6adc9410376cc8a71fa017df7e9c..b82aab1214992be73d876a42424234e3cea46455 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -39,7 +39,7 @@ std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
 }
 
 std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
-    const std::string& file_name, const std::vector<framework::DDim>& dims) {
+    const std::string& file_name) {
   size_t separator_pos = file_name.find_last_of(kFileFormatSeparator);
   PADDLE_ENFORCE_NE(separator_pos, std::string::npos,
                     "File name illegal! A legal file name should be like: "
@@ -49,7 +49,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
   auto itor = FileReaderRegistry().find(filetype);
   PADDLE_ENFORCE(itor != FileReaderRegistry().end(),
                  "No file reader registered for '%s' format.", filetype);
-  framework::ReaderBase* reader = (itor->second)(file_name, dims);
+  framework::ReaderBase* reader = (itor->second)(file_name);
   return std::unique_ptr<framework::ReaderBase>(reader);
 }
 
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index 244bf15f068a47efc29ee54492cdbdeb10025020..25c3e7d77b788d38daf6dee1fc79e5c1c97e8842 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -25,22 +25,21 @@ namespace reader {
 
 static constexpr char kFileFormatSeparator[] = ".";
 
-using FileReaderCreator = std::function<framework::ReaderBase*(
-    const std::string&, const std::vector<framework::DDim>&)>;
+using FileReaderCreator =
+    std::function<framework::ReaderBase*(const std::string&)>;
 
 std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
 
 template <typename Reader>
 int RegisterFileReader(const std::string& filetype) {
-  FileReaderRegistry()[filetype] = [](
-      const std::string& fn, const std::vector<framework::DDim>& dims) {
-    return new Reader(fn, dims);
+  FileReaderRegistry()[filetype] = [](const std::string& fn) {
+    return new Reader(fn);
   };
   return 0;
 }
 
 std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
-    const std::string& file_name, const std::vector<framework::DDim>& dims);
+    const std::string& file_name);
 
 extern std::vector<framework::DDim> RestoreShapes(
     const std::vector<int>& shape_concat, const std::vector<int>& ranks);
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 9854a31f5b10f5ecd940c0d41c2c3e468fc17bad..4a6ce938a5f337d035b21f562d46daf606236db0 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
 
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
@@ -51,7 +49,7 @@ class RecvOp : public framework::OperatorBase {
       rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
     }
     if (sync_mode) {
-      rpc_client->Wait();
+      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     }
   }
 };
diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
index c5b5398787b44e658b0f8390162df0e6c3006651..f0e5f6580fbc9e70562cb2fdd7e0c5d8729bc9a7 100644
--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::SumGradFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum_grad,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                             ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
index e67d7e1da5f0244d2dee346873692a80cbad2fc4..3e8d1bbdba504669bc06e0637094e3bee840adf2 100644
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -14,11 +14,69 @@
 
 #pragma once
 
+#include <vector>
+
 #include "paddle/fluid/operators/reduce_op.h"
 
 namespace paddle {
 namespace operators {
 
+// use for loop to speed up Eigen broadcast. 4 timer faster then broadcast
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceSumGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
+        dims.size() == 1) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // handle reduce_all
+      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
+        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      // handle reduce by one dimension
+      int reduce_dim_index = dims[0];
+      if (reduce_dim_index < 0) {
+        reduce_dim_index += input0->dims().size();
+      }
+
+      auto& input_dim = input0->dims();
+      int64_t before_dim = 1;
+      for (int i = 0; i < reduce_dim_index; ++i) {
+        before_dim *= input_dim[i];
+      }
+      int64_t reduce_dim = input_dim[reduce_dim_index];
+      int64_t after_dim = 1;
+      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+        after_dim *= input_dim[i];
+      }
+      for (int64_t i = 0; i < before_dim; ++i) {
+        for (int64_t j = 0; j < reduce_dim; ++j) {
+          for (int64_t k = 0; k < after_dim; ++k) {
+            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+                input2_d[i * after_dim + k];
+          }
+        }
+      }
+      return;
+    }
+
+    // default use Eigen broadcast
+    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    kernel.Compute(context);
+  }
+};
+
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
@@ -31,7 +89,7 @@ struct SumGradFunctor {
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
   }
 };
 
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 918f3be533d51367eade5f5108ad2eab954a9303..a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
-    AddAttr<bool>("inplace",
-                  "(default: false) Change the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
@@ -216,7 +210,7 @@ class ReshapeKernel {
     if (shape_tensor) {
       auto *shape_data = shape_tensor->data<int>();
       framework::Tensor cpu_shape_tensor;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
+      if (platform::is_gpu_place(shape_tensor->place())) {
         TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
         shape_data = cpu_shape_tensor.data<int>();
       }
@@ -233,16 +227,9 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    bool inplace = ctx.Attr<bool>("inplace");
+    out->mutable_data(ctx.GetPlace(), in->type());
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);
     out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data(ctx.GetPlace(), in->type());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
   }
 };
 
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
   void operator()(const framework::ExecutionContext &ctx) const {
     auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto in_dims = d_x->dims();
 
     d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    d_x->Resize(in_dims);
   }
 };
 
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 6b4572dcccc21e783f1df0b9bcde11d532ff4ba8..1866a86048acbefadcb4d82cd6309cd16f0352d6 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -39,24 +39,19 @@ class SendBarrierOp : public framework::OperatorBase {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
     bool sync_mode = Attr<bool>("sync_mode");
 
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
 
     // need to wait before sending send_barrier message
-    rpc_client->Wait();
+    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     if (sync_mode) {
       for (auto& ep : eps) {
         VLOG(3) << "send barrier, ep: " << ep;
         rpc_client->AsyncSendBatchBarrier(ep);
       }
-      rpc_client->Wait();
+      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     }
   }
 };
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 0cac329aafa8c4c67cae48ba62a48575f5edba92..3cd42f2d059532b7090e66ce21de8e5cb014adf1 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
@@ -59,7 +56,7 @@ class SendOp : public framework::OperatorBase {
       }
     }
     if (sync_send) {
-      rpc_client->Wait();
+      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
     }
   }
 };
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index deab005149027caffa962783df944fad7110382f..dc26c53c64f06ce21856fb5af8f2a5eb3fc75bb7 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include "paddle/fluid/framework/ir/node.h"
 
 namespace paddle {
 namespace operators {
@@ -22,7 +23,10 @@ inline bool NeedSend(const framework::Scope& scope,
                      const std::string& varname) {
   // dummy variable is only used in parallel executor to represent
   // some dependency relationship, we don't need to send/recv it.
-  if (varname == "dummy") return false;
+  // TODO(paddle-dev): Why would parallel executor logic leaked into here?
+  if (varname.find(framework::ir::Node::kControlDepVarName) !=
+      std::string::npos)
+    return false;
   auto* var = scope.FindVar(varname);
   PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                           varname);
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 5596fa0648ccc151bc0d11de9c556599428a8d71..2bdb23e999621b10799b5163f326bc4b66a437e6 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Out->mutable_data<T>(context.GetPlace());
 
+    auto dims = X->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_x;
+    framework::LoDTensor flattened_out;
+    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+
     math::SoftmaxCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(), X, Out);
+        context.template device_context<platform::CUDADeviceContext>(),
+        &flattened_x, &flattened_out);
   }
 };
 
@@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
+    auto dims = Out->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_out;
+    framework::LoDTensor flattened_d_out;
+    framework::LoDTensor flattened_d_x;
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
+    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+
     math::SoftmaxGradCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(), Out,
-        dOut, dX);
+        context.template device_context<platform::CUDADeviceContext>(),
+        &flattened_out, &flattened_d_out, &flattened_d_x);
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index 6668e6b9e917eea7ba4a80ac78917b73eb827208..01819f53e3ab0973f6140c5a81f18f954b6a0376 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc;
 
 using mkldnn::memory;  // Note: paddle has also "memory" namespace
 using mkldnn::primitive;
-using mkldnn::softmax_forward;
-using mkldnn::softmax_backward;
 using mkldnn::prop_kind;
+using mkldnn::softmax_backward;
+using mkldnn::softmax_forward;
 using mkldnn::stream;
 using platform::to_void_cast;
 
@@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     auto mkldnn_engine = dev_ctx.GetEngine();
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
-    PADDLE_ENFORCE(input->dims().size() == 2UL,
-                   "The input of softmax op must be a 2D matrix.");
-    const T* input_data = input->data<T>();
-    // allocate memory for output
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
-    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
-    // we will make normalization after final eg. axis: 1
-    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
-                   "Softmax input and output dimensions should match");
+    PADDLE_ENFORCE_EQ(
+        input->dims(), output->dims(),
+        "The shape of softmax's input and output must be identical.");
+
+    // make sure 'output' holds memory, which will be shared by
+    // 'flattened_output' later.
+    output->mutable_data<T>(ctx.GetPlace());
+
+    // flatten input and output to 2-D matrixs
+    auto dims = input->dims();  // input and output share the same shape
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::Tensor flattened_input;
+    framework::Tensor flattened_output;
+    flattened_input.ShareDataWith(*input).Resize(flattened_dims);
+    flattened_output.ShareDataWith(*output).Resize(flattened_dims);
+
+    const T* input_data = flattened_input.data<T>();
+    T* output_data = flattened_output.mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> src_tz = paddle::framework::vectorize2int(flattened_dims);
+    std::vector<int> dst_tz = src_tz;
     // Same memory descriptor to be used for input and output
     memory::dims softmax_tz = {src_tz[0], src_tz[1]};
     // Generate keys for storing/retriving primitives for this operator
@@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     auto mkldnn_engine = dev_ctx.GetEngine();
     const Tensor* output = ctx.Input<Tensor>("Out");
-    const T* dst_data = output->data<T>();
-
     auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-    const auto* diff_dst_ptr = dout->template data<T>();
-
     auto* dx =
         ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
 
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    PADDLE_ENFORCE_EQ(
+        dout->dims(), dx->dims(),
+        "The shape of softmax_grad's input and output must be identical.");
+
+    // make sure 'dx' holds memory, which will be shared by 'flattened_dx'
+    // later.
+    dx->template mutable_data<T>(ctx.GetPlace());
+
+    auto dims = dout->dims();  // input and output share the same shape
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::Tensor flattened_output;
+    framework::Tensor flattened_dout;
+    framework::Tensor flattened_dx;
+    flattened_output.ShareDataWith(*output).Resize(flattened_dims);
+    flattened_dout.ShareDataWith(*dout).Resize(flattened_dims);
+    flattened_dx.ShareDataWith(*dx).Resize(flattened_dims);
+
+    const T* dst_data = flattened_output.data<T>();
+    const T* diff_dst_ptr = flattened_dout.template data<T>();
+    T* diff_src_ptr = flattened_dx.template mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(flattened_dims);
     std::vector<int> src_tz(dst_tz);
-    PADDLE_ENFORCE(output->dims().size() == 2UL,
-                   "The input of softmax op must be a 2D matrix.");
-    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
-    // we will make normalization after final eg. axis: 1
-    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
-                   "Softmax input and output dimensions should match");
+
     // Same memory descriptor to be used for input and output
     memory::dims softmax_tz = {src_tz[0], src_tz[1]};
     // Currently only supports NC data format
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 31a7458f637921c290fc71ac748143867b4aae19..bb081238820b9ee3ae095442d21cfce11f7b41e5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SoftmaxOp should not be null.");
 
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(x_dims.size() == 2UL,
-                   "The input of softmax op must be a matrix.");
-    ctx->SetOutputDim("Out", x_dims);
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of softmax. "
-             "2-D with shape [batch_size, input_feature_dimensions].");
+             "The input tensor of softmax, "
+             "whose last dimension is the input_feature_dimensions.");
     AddOutput("Out", "The normalized values with the same shape as X.")
         .Reuse("X");
     AddAttr<bool>(
@@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Softmax Operator.
 
-The input of the softmax operator is a 2-D tensor with shape N x K (N is the
-batch_size, K is the dimension of input feature). The output tensor has the
-same shape as the input tensor.
+The input of the softmax operator is a tensor of any rank. The output tensor 
+has the same shape as the input.
 
-For each row of the input tensor, the softmax operator squashes the
-K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-values in the range [0, 1] that add up to 1.
+The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
+second dimension(row length) is as same as the last dimension of the input 
+tensor, and the first dimension(column length) is the product of all other 
+dimensions of the input tensor. For each row of the matrix, the softmax operator 
+squashes the K-dimensional(K is the width of the matrix, which is also the size 
+of the input tensor's last dimension) vector of arbitrary real values to a 
+K-dimensional vector of real values in the range [0, 1] that add up to 1.
 It computes the exponential of the given dimension and the sum of exponential
 values of all the other dimensions in the K-dimensional vector input.
 Then the ratio of the exponential of the given dimension and the sum of
 exponential values of all the other dimensions is the output of the softmax
 operator.
 
-For each row $i$ and each column $j$ in Input(X), we have:
+For each row $i$ and each column $j$ in the matrix, we have:
     $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                       ctx->GetInputDim(framework::GradVarName("Out")),
                       "Input(Out) and its gradients should have a same shape.");
 
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
   }
 
  protected:
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("softmax_grad");
+
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index 600da45a0bbb69b76d59c981e195fc03a49b0504..1205bd0587f32caae04c27ecea581fc17988507f 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Out->mutable_data<T>(context.GetPlace());
 
+    auto dims = X->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_x;
+    framework::LoDTensor flattened_out;
+    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+
     math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), X, Out);
+        context.template device_context<DeviceContext>(), &flattened_x,
+        &flattened_out);
   }
 };
 
@@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
+    auto dims = Out->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_out;
+    framework::LoDTensor flattened_d_out;
+    framework::LoDTensor flattened_d_x;
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
+    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+
     math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), Out, dOut, dX);
+        context.template device_context<DeviceContext>(), &flattened_out,
+        &flattened_d_out, &flattened_d_x);
   }
 };
 
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d263426e073d95ad6d584c7370baf596587a993d..c4af5a65fc5f81c1af7c1fdcca637ca37c940637 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
+      for (auto &out : outs) {
+        out->mutable_rows()->clear();
+      }
       // get rows for outputs
-      for (auto &id : ids_rows) {
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        outs[shard_id]->mutable_rows()->push_back(id);
+      std::unordered_map<int64_t, size_t> id_to_index;
+      for (size_t i = 0; i < ids_rows.size(); ++i) {
+        id_to_index[ids_rows[i]] = i;
+        size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
       }
 
       int64_t row_width = ids_dims[1];
@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
             {static_cast<int64_t>(out->rows().size()), row_width});
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
-          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+          memcpy(output + i * row_width,
+                 ids + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c507baf3a0ab0a557d29a53700685753616193b
--- /dev/null
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -0,0 +1,202 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class SqueezeOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SqueezeOp should not be null.");
+
+    const auto &x_dims = ctx->GetInputDim("X");
+    // Check input tensor dims (<6) Eigen limit.
+    PADDLE_ENFORCE(x_dims.size() <= 6,
+                   "Invalid dimnesions, the rank of Input(X) "
+                   "should be in the range of [1, 6] (Eigen limit).");
+
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    for (int a : axes) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The squeeze axis should be less than input "
+                        "tensor's rank.");
+    }
+
+    auto out_dims = GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
+                                        const framework::DDim &in_dims) {
+    size_t num_squeeze_dims = squeeze_dims.size();
+    int cnt_squeezed_dims = 0;
+    bool should_squeeze[9] = {false};
+
+    // Determines number of dimensions of output tensor after squeeze.
+    // Mark and count the dimensions need to be squeezed
+    if (num_squeeze_dims == 0) {
+      for (int idx = 0; idx < in_dims.size(); ++idx) {
+        if (in_dims[idx] == 1) {
+          should_squeeze[idx] = true;
+          ++cnt_squeezed_dims;
+        }
+      }
+    } else {
+      for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
+        int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
+                                            : squeeze_dims[idx];
+        // Check current index, the upper limit has beed checked in line 36.
+        PADDLE_ENFORCE(current >= 0,
+                       "Invalid axis, the negative axis is out of range.");
+        PADDLE_ENFORCE(in_dims[current] == 1,
+                       "Invalid axis index, the axis that will be squeezed "
+                       "should be equal to 1.");
+
+        if (!(should_squeeze[current])) {
+          ++cnt_squeezed_dims;
+        }
+        should_squeeze[current] = true;
+      }
+    }
+
+    // Make output dimensions
+    std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
+    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
+      if (!should_squeeze[in_idx]) {
+        output_shape[out_idx++] = in_dims[in_idx];
+      }
+    }
+
+    return framework::make_ddim(output_shape);
+  }
+};
+
+class SqueezeOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor). The input tensor of squeeze operator.");
+    AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
+    AddAttr<std::vector<int>>("axes",
+                              "(std::vector<int>). List of integers,"
+                              " indicating the dimensions to squeeze.")
+        .SetDefault({});
+    AddAttr<bool>("inplace",
+                  "(default: false) Squeeze the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
+    AddComment(R"DOC(
+        Squeeze Operator.
+        
+        Remove single-dimensional entries from the shape of a tensor. 
+        Takes a parameter axes with a list of axes to squeeze. 
+        If axes is not provided, all the single dimensions will be removed from the shape. 
+        If an axis is selected with shape entry not equal to one, an error is raised.
+        
+        Examples:
+        Case 1:
+          Given 
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = [0]
+          we get:
+            Out.shape = (3, 1, 5)
+
+        Case 2:
+          Given
+            X.shape = (1, 3, 1, 5)
+          and 
+            axes = []
+          we get:
+            Out.shape = (3, 5)
+    )DOC");
+  }
+};
+
+class SqueezeGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class SqueezeGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Tell linker to use reshape op
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
+                  ops::SqueezeOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc
index f78d977760f18c9eb1270e515e68acb208a7c9a4..d2035777ee2289291a02594ee289156504df09d9 100644
--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
@@ -88,7 +88,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         input_format = memory::format::nc;
       }
 
-      for (int i = in_place ? 1 : 0; i < N; i++) {
+      for (int i = 0; i < N; i++) {
         PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
                        "all inputs must be all LoDTensors");
         auto& input = in_vars[i]->Get<LoDTensor>();
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..ee3078876c15b06a887064f08dc0c05d450b5f77 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -24,6 +24,9 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 
 namespace paddle {
+
+DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+
 namespace operators {
 
 using inference::Singleton;
@@ -52,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-
-  switch (shape.size()) {
-    case 2:
-      return nvinfer1::Dims2(shape[0], shape[1]);
-    case 3:
-      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
-    case 4:
-      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
-    default:
-      return nvinfer1::Dims();
-  }
-  return nvinfer1::Dims();
+  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
+  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
 }
 
 }  // namespace
@@ -83,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     parameters.insert(param);
   }
 
+  std::vector<std::string> output_maps =
+      context.Attr<std::vector<std::string>>("output_name_mapping");
+
   // TODO(Superjomn) replace this with a different stream
   auto *engine = Singleton<TRT_EngineManager>::Global().Create(
       max_batch, max_workspace, nullptr /*engine hold its own stream*/,
@@ -90,27 +86,37 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
   engine->InitNetwork();
 
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  VLOG(4) << "parsed var size " << block.AllVars().size();
   // Add inputs
   VLOG(4) << "declare inputs";
   for (auto &input : context.Inputs("Xs")) {
+    if (parameters.count(input)) continue;
     VLOG(4) << "declare input " << input;
     auto *var = block.FindVar(input);
+    // TensorRT engine need to create parameters. The parameter's description
+    // should be set in
+    PADDLE_ENFORCE(var, "no variable called %s", input);
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
+    // For the special batch_size placeholder -1, drop it and pass the real
+    // shape of data.
+    // TODO(Superjomn) fix this with batch broadcast, or it can't handle
+    // variational batch size.
+    if (shape[0] == -1) {
+      shape[0] = FLAGS_tensorrt_engine_batch_size;
+    }
     engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(var->GetShape()));
+        Vec2TRT_Dims(shape));
   }
 
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
       block_desc, parameters, context.scope(), engine);
 
   // Add outputs
-  VLOG(4) << "declare outputs";
-  for (auto &output : context.Outputs("Ys")) {
-    VLOG(4) << "declare output " << output;
+  for (auto &output : output_maps) {
     engine->DeclareOutput(output);
   }
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..2cbe1213a2f428a3ce56b06f97636baeb4b66c26 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -24,6 +24,9 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
+
+DECLARE_int32(tensorrt_engine_batch_size);
+
 namespace operators {
 
 using inference::Singleton;
@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "TensorRTEngineKernel executing";
     auto engine_name = context.Attr<std::string>("engine_uniq_key");
     if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
@@ -61,14 +63,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    // Try to determine a batch_size
-    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
-        context.scope(), input_names.front());
-    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
+                      context.Attr<int>("max_batch"));
+
+    std::vector<std::string> output_maps =
+        context.Attr<std::vector<std::string>>("output_name_mapping");
 
+    auto params = context.Attr<std::vector<std::string>>("parameters");
+    std::unordered_set<std::string> parameters;
+    for (const auto& param : params) {
+      parameters.insert(param);
+    }
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
+      if (parameters.count(x)) continue;
       // convert input and copy to TRT engine's buffer
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
@@ -81,12 +89,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       }
     }
     // Execute the engine.
-    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine->Execute(batch_size);
+    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
+    engine->Execute(FLAGS_tensorrt_engine_batch_size);
+
     // Convert output tensor from engine to fluid
+    int output_index = 0;
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -94,18 +104,24 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+
       fluid_t->Resize(framework::make_ddim(ddim));
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
-      if (platform::is_cpu_place(fluid_t->place())) {
-        // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
-            size * sizeof(float));
-      } else {
-        engine->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-            size * sizeof(float));
-      }
+
+      // TODO(Superjomn) find some way to determine which device to output the
+      // tensor.
+      // if (platform::is_cpu_place(fluid_t->place())) {
+      // TODO(Superjomn) change this float to dtype size.
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
+                  FLAGS_tensorrt_engine_batch_size;
+      engine->GetOutputInCPU(output_maps[output_index],
+                             fluid_t->mutable_data<float>(platform::CPUPlace()),
+                             size * sizeof(float));
+      //} else {
+      // engine->GetOutputInGPU(
+      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+      // size * sizeof(float));
+      //}
+      output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 82a16361e40513aeaf6f510e450f58989369fcdb..37657fa0b0498986fe67027415279af1775e58b9 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
 
   LOG(INFO) << "create block desc";
   framework::BlockDesc block_desc(&program, block_);
-  LOG(INFO) << "create mul op";
-  auto* mul = block_desc.AppendOp();
-  mul->SetType("mul");
-  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
-  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
-  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+  LOG(INFO) << "create fc op";
+  auto* fc0 = block_desc.AppendOp();
+  fc0->SetType("fc");
+  fc0->SetInput("X", std::vector<std::string>({"x"}));     // 4 x 1 x 1
+  fc0->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  fc0->SetOutput("Out", std::vector<std::string>({"z"}));  // 6 x 1 x 1
 
   LOG(INFO) << "create fc op";
-  auto* fc = block_desc.AppendOp();
-  fc->SetType("mul");
-  fc->SetInput("X", std::vector<std::string>({"z"}));
-  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
-  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+  auto* fc1 = block_desc.AppendOp();
+  fc1->SetType("fc");
+  fc1->SetInput("X", std::vector<std::string>({"z"}));
+  fc1->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc1->SetOutput("Out", std::vector<std::string>({"z0"}));  // 8 x 1 x 1
 
   // Set inputs' variable shape in BlockDesc
-  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
   AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
   AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
   AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
 
   // It is wired, need to copy manually.
-  *block_->add_ops() = *mul->Proto();
-  *block_->add_ops() = *fc->Proto();
+  *block_->add_ops() = *fc0->Proto();
+  *block_->add_ops() = *fc1->Proto();
 
   ASSERT_EQ(block_->ops_size(), 2);
 
   LOG(INFO) << "create tensorrt desc";
   framework::OpDesc engine_op_desc(nullptr);
   engine_op_desc.SetType("tensorrt_engine");
-  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
@@ -102,6 +103,9 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z0"}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
@@ -195,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
 
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z3"}));
+
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
 
   // Execute them.
@@ -207,5 +215,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
 }  // namespace operators
 }  // namespace paddle
 
-USE_TRT_CONVERTER(mul)
 USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 7ddb82ef6ff063868a4b9b603b8ab89700b9dd13..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel<T> {
 #endif
     for (size_t i = 0; i < row; i++) {
       std::vector<std::pair<T, size_t>> vec;
+      vec.reserve(col);
       for (size_t j = 0; j < col; j++) {
         vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
       }
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2a15fdf572e0de30f9949dda5020e130b0c5585
--- /dev/null
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -0,0 +1,191 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class UnsqueezeOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnsqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnsqueezeOp should not be null.");
+
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    const auto &x_dims = ctx->GetInputDim("X");
+    // Validity Check: input tensor dims (<6).
+    PADDLE_ENFORCE(x_dims.size() <= 6,
+                   "Invalid dimensions, the rank of Input(X) "
+                   "should be in the range of [1, 6] (Eigen limit)");
+    auto out_dims = GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
+                                        const framework::DDim &in_dims) {
+    int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
+    int cur_output_size = in_dims.size();
+    std::vector<int64_t> output_shape(output_size, 0);
+
+    // Validity Check: rank range.
+    PADDLE_ENFORCE(output_size <= 6,
+                   "The output tensor's rank should be less than 6.");
+
+    for (int axis : unsqz_dims) {
+      int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
+      // Vaildity Check: the axis bound
+      PADDLE_ENFORCE(
+          cur >= 0 && cur <= cur_output_size,
+          "The unsqueeze dims must be within range of current rank.");
+      // Move old axis, and insert new axis
+      for (int i = cur_output_size; i >= cur; --i) {
+        if (output_shape[i] == 1) {
+          // Move axis
+          output_shape[i + 1] = 1;
+          output_shape[i] = 0;
+        }
+      }
+      output_shape[cur] = 1;
+      // Add the output size.
+      cur_output_size++;
+    }
+
+    // Make output shape
+    for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
+      if (output_shape[out_idx] == 0) {
+        output_shape[out_idx] = in_dims[in_idx++];
+      }
+    }
+
+    return framework::make_ddim(output_shape);
+  }
+};
+
+class UnsqueezeOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axes = Attr<std::vector<int>>("axes");
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(out_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+    // Invoke Reshape op.
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor). The input tensor of unsqueeze operator.");
+    AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
+    AddAttr<std::vector<int>>("axes",
+                              "(std::vector<int>). List of integers,"
+                              " indicating the dimensions to be inserted")
+        .AddCustomChecker([](const std::vector<int> &axes) {
+          PADDLE_ENFORCE(!axes.empty(),
+                         "Invalid axes, The unsqueeze axes is empty.");
+          // Validity Check: axes dims (<6).
+          PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
+                         "Invalid dimensions, dynamic dimensions should be "
+                         "within [1, 6] dimensions (Eigen limit).");
+          // Validity Check: the range of unsqueeze aixs.
+          for (int axis : axes) {
+            PADDLE_ENFORCE(axis < 6,
+                           "Invalid dimensions, input axis should be"
+                           " within [1, 6] dimensions (Eigen limit).");
+          }
+        });
+    AddAttr<bool>(
+        "inplace",
+        "(default: false) Unsqueeze the source tensor's shape without "
+        "memory copy. When Attr(inplace) is set true, the output "
+        "tensor shares memory with Input(X), otherwise, a new output "
+        "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
+    AddComment(R"DOC(
+    Unsqueeze Operator.
+    
+    Insert single-dimensional entries to the shape of a tensor. 
+    Takes one required argument axes, a list of dimensions that will be inserted. 
+    Dimension indices in axes are as seen in the output tensor. 
+
+    For example: 
+      Given a tensor such that tensor with shape [3, 4, 5], 
+      then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
+    )DOC");
+  }
+};
+
+class UnsqueezeGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class UnsqueezeGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(x_dims);
+    attrs["inplace"] = Attr<bool>("inplace");
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Tell linker to use reshape op.
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
+                  ops::UnsqueezeOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
+                  ops::UnsqueezeGradInferShape);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 20037d0764056c2a093af801c9cc1eb788dd46d6..a6f68f8b0c0a9b07c326888e30c0c911e7861607 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,7 +46,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
@@ -60,3 +60,7 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
+
+IF(WITH_GPU)
+  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+ENDIF()
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 77ecb170111d63f23312d06fa8a8172bc45f2a4e..234a04b5c2eb5ee643e8a4e723b28331cd8e6ee0 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 
 #ifdef PADDLE_WITH_MKLML
+#include <omp.h>
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
@@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+  omp_set_num_threads(num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index ecec4178f2d9937920e52eb74bf9068b84e741a0..23457ff5fe1ec27094113ba0dde26adc64c716b5 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+// NOTE(): support float16 to half in header file.
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
@@ -36,6 +40,18 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 #endif
 }
 
+// CUDA 9.0 have native compatible float16 shfl_down
+#if CUDA_VERSION < 9000
+template <>
+__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
+                                                       float16 val, int delta,
+                                                       int width) {
+  half tmp = static_cast<half>(val);
+  __shfl_down(tmp, static_cast<unsigned>(delta), width);
+  return float16(tmp);
+}
+#endif
+
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
@@ -46,6 +62,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
 #endif
 }
 
+template <typename T>
+HOSTDEVICE T Infinity() {
+  return INFINITY;
+}
+
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
   // NOTE(zcd): The warp size should be taken from the
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68
--- /dev/null
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -0,0 +1,153 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <random>
+
+#define PADDLE_CUDA_FP16
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using paddle::platform::float16;
+
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
+  }
+}
+
+template <typename T>
+struct AddFunctor {
+  T operator()(const T& a, const T& b) { return a + b; }
+};
+
+template <typename T>
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
+  }
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+// cuda primitives
+TEST(CudaAtomic, Add) {
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
+
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
+}
+
+TEST(CudaAtomic, float16) {
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// unalignment of uint8
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // right shift 1, mimic the unalignment of address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same with float16 testcase
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
+
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
+}
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index d535ed2f89df6a0b311ec068ecd92c8e3183cee7..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+#include <stdio.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
 
 #define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+  __device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
 
 #define USE_CUDA_ATOMIC(op, T) \
   CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
@@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
   static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
   return CudaAtomicAdd(
-      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));           // NOLINT
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =                 // NOLINT
-      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
-  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
+  unsigned long long int *address_as_ull =                  // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
 
   do {
     assumed = old;
@@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 
   return __longlong_as_double(old);
 }
+#endif
+
+#ifdef PADDLE_CUDA_FP16
+// NOTE(dzhwinter): cuda do not have atomicCAS for half.
+// Just use the half address as a unsigned value address and
+// do the atomicCAS. According to the value store at high 16 bits
+// or low 16 bits, then do a different sum and CAS.
+// Given most warp-threads will failed on the atomicCAS, so this
+// implemented should be avoided in high concurrency. It's will be
+// slower than the way convert value into 32bits and do a full atomicCAS.
+
+// convert the value into float and do the add arithmetic.
+// then store the result into a uint32.
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+  float16 low_half;
+  // the float16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half = static_cast<float16>(static_cast<float>(low_half) + x);
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+  float16 high_half;
+  // the float16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half = static_cast<float16>(static_cast<float>(high_half) + x);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Add, float16) {
+  // concrete packed float16 value may exsits in lower or higher 16bits
+  // of the 32bits address.
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t sum;
+  uint32_t newval;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // the float16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // the float16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 #endif
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2..bb8b14bb9fa41942c3aa653ca224c0842fbf9a00 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
 
-#define CUDNN_ENFORCE(condition)                                  \
-  do {                                                            \
-    cudnnStatus_t status = condition;                             \
-    if (status != CUDNN_STATUS_SUCCESS) {                         \
-      VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \
-      PADDLE_THROW("cuDNN call failed");                          \
-    }                                                             \
+#define CUDNN_ENFORCE(condition)                                     \
+  do {                                                               \
+    cudnnStatus_t status = condition;                                \
+    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
+      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
+    }                                                                \
   } while (false)
 
 enum class DataLayout {  // Not use
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf8ec439d158e57c94d3e6e684bce116..8fa8dbd67c936439840cffa073b6fa6693dd31a1 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -30,9 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..d2a571f4345b544ad5e74f4629c3967593d6d628 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -99,9 +99,5 @@ std::string CurAnnotation();
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index ffd183af68514dbb1a8b3de39000c9ca3f56ddc3..efb021c838e3680ab2cdd1c4b298cf7ec2186478 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -67,8 +67,11 @@ struct float16;
 }  // namespace platform
 }  // namespace paddle
 
+// NOTE():
+// Do not move the eigen.h header, otherwise the eigen_vector<bool> will failed.
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace platform {
@@ -898,6 +901,30 @@ struct is_pod<paddle::platform::float16> {
       is_standard_layout<paddle::platform::float16>::value;
 };
 
+template <>
+struct is_floating_point<paddle::platform::float16>
+    : std::integral_constant<
+          bool, std::is_same<paddle::platform::float16,
+                             typename std::remove_cv<
+                                 paddle::platform::float16>::type>::value> {};
+template <>
+struct is_signed<paddle::platform::float16> {
+  static const bool value = true;
+};
+
+template <>
+struct is_unsigned<paddle::platform::float16> {
+  static const bool value = false;
+};
+
+inline bool isnan(const paddle::platform::float16& a) {
+  return paddle::platform::isnan(a);
+}
+
+inline bool isinf(const paddle::platform::float16& a) {
+  return paddle::platform::isinf(a);
+}
+
 template <>
 struct numeric_limits<paddle::platform::float16> {
   static const bool is_specialized = true;
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc..27e930e6e0a76982b3f27619f38a4a08d82cafa1 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, floating) {
+  // compile time assert.
+  PADDLE_ASSERT(std::is_floating_point<float16>::value);
+}
+
 TEST(float16, print) {
   float16 a = float16(1.0f);
   std::cout << a << std::endl;
 }
 
+// CPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  float16 c = static_cast<float16>(INFINITY);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(c), true);
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = static_cast<float16>(NAN);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(c), true);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc..e2b7ca9b03809113c31af8ff4d3ad3713748f330 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -11,11 +11,13 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/float16.h"
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <bitset>
+#include <iostream>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/legacy/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) {
   }
 }
 
+template <typename T>
+struct Functor {
+  bool operator()(const T& val) {
+    return std::type_index(typeid(T)) ==
+           std::type_index(typeid(platform::float16));
+  }
+};
+
+TEST(float16, typeid) {
+  // the framework heavily used typeid hash
+  Functor<float16> functor;
+  float16 a = float16(.0f);
+  Functor<int> functor2;
+  int b(0);
+
+  // compile time assert
+  PADDLE_ASSERT(functor(a) == true);
+  PADDLE_ASSERT(functor2(b) == false);
+}
+
+// GPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  // underflow to 0
+  float16 native_a(5e-40f);
+  // overflow to inf
+  float16 native_b(5e40f);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(native_b), true);
+  EXPECT_EQ(native_a, float16(0));
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = float16(5e40);
+  // inf * +-0 will get a nan
+  float16 d = c * float16(0);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(d), true);
+}
+
+TEST(float16, cast) {
+  float16 a;
+  a.x = 0x0070;
+  auto b = a;
+  {
+    // change semantic, keep the same value
+    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    EXPECT_EQ(b, c);
+  }
+
+  {
+    // use uint32 low 16 bit store float16
+    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    float16 d;
+    d.x = c;
+    EXPECT_EQ(b, d);
+  }
+}
+
 }  // namespace platform
 }  // namespace paddle
 #endif  // PADDLE_CUDA_FP16
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 0b776528414735e8a7c1e3763e7ccb662bb9f285..6f1f0c4796f3bae2fb419bf103cb6c0c5489bf65 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
+
 namespace paddle {
 namespace framework {
 
@@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
 #ifndef PADDLE_WITH_MKLDNN
-  platform::SetNumThreads(1);
+  platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
 }
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 33fec2c1073819d88d85a8872227adcb9df3e8f4..a8f93e6848a1db1f5aa0ee266a076af2b5d0c964 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -222,15 +222,16 @@ class MKLDNNHandler {
 
   static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
-    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
-      std::string dstr = "";
-      for (size_t i = 0; i < operand_dims.size(); ++i) {
-        dstr += std::to_string(operand_dims[i]) + "-";
-      }
-      return dstr;
-    };
-
     return dims2str(operand_dims) + suffix;
+  };
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
   }
 
  protected:
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..7c8d8a5964fa5258bebaf2c8522886ae5886ab2c 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -110,6 +110,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id,
   has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
   if (has_cuda_) {
     auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+    PADDLE_ENFORCE(cudaSetDevice(
+        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
     PADDLE_ENFORCE(cudaGetDevice(&device_));
     PADDLE_ENFORCE(cudaEventCreate(&event_));
     auto stream = cuda_dev_ctx->stream();
@@ -176,6 +178,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
 
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   dev_ctx_ = dev_ctx;
@@ -186,11 +189,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
 }
 
 RecordEvent::~RecordEvent() {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
-                          BlockDepth(), CurThread());
+                          BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
@@ -198,6 +202,7 @@ RecordEvent::~RecordEvent() {
 
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   SetCurBlock(block_id);
@@ -205,27 +210,18 @@ RecordBlock::RecordBlock(int block_id)
 }
 
 RecordBlock::~RecordBlock() {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
     tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          CurThread());
+                          g_thread_id);
   }
   ClearCurBlock();
 }
 
-RecordThread::RecordThread(int thread_id) {
-  if (g_state == ProfilerState::kDisabled) return;
-  SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
-  if (g_state == ProfilerState::kDisabled) return;
-  ClearCurThread();
-}
-
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -95,11 +95,6 @@ struct RecordBlock {
   uint64_t start_ns_;
 };
 
-struct RecordThread {
-  explicit RecordThread(int thread_id);
-  ~RecordThread();
-};
-
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 45f60fc9d76560b133fa06198a24c7eaccc24088..dc9fad29f281a1c6ac300b48f9e600ff802a5752 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -38,6 +38,7 @@ limitations under the License. */
 #endif
 #endif
 
+#include <boost/any.hpp>
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
 #include <boost/variant.hpp>
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index fcd3356d44ee592233c3883d439d0677714900b8..2199f5311fd3728e624fc222a1b876eb947cc0aa 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -145,14 +145,14 @@ void BindBlockDesc(pybind11::module *m) {
       .def_property_readonly("id", &pd::BlockDesc::ID)
       .def_property_readonly("parent", &pd::BlockDesc::Parent)
       .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
-      .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
+      .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
       .def("append_op", &pd::BlockDesc::AppendOp,
            pybind11::return_value_policy::reference)
-      .def("prepend_op", &pd::BlockDesc::PrependOp,
+      .def("_prepend_op", &pd::BlockDesc::PrependOp,
            pybind11::return_value_policy::reference)
-      .def("insert_op", &pd::BlockDesc::InsertOp,
+      .def("_insert_op", &pd::BlockDesc::InsertOp,
            pybind11::return_value_policy::reference)
-      .def("remove_op", &pd::BlockDesc::RemoveOp)
+      .def("_remove_op", &pd::BlockDesc::RemoveOp)
       .def("var",
            [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
@@ -165,7 +165,7 @@ void BindBlockDesc(pybind11::module *m) {
              return self.HasVar(name);
            },
            pybind11::return_value_policy::reference)
-      .def("rename_var",
+      .def("_rename_var",
            [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
               const pybind11::bytes &byte_name_new) {
              std::string name = byte_name;
@@ -189,7 +189,7 @@ void BindBlockDesc(pybind11::module *m) {
              return self.FindVarRecursive(name);
            },
            pybind11::return_value_policy::reference)
-      .def("remove_var",
+      .def("_remove_var",
            [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.RemoveVar(name);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7a8bb712452538b7e2a349d56a15de3284f82b39..ee1c8d46ddfb4f0c09591bb78dc720555dc735b4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <Python.h>
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <mutex>  // NOLINT // for call_once
 #include <string>
 #include <unordered_map>
@@ -66,6 +67,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithDIST() {
+#ifdef PADDLE_WITH_DISTRIBUTE
+  return true;
+#else
+  return false;
+#endif
+}
+
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
@@ -78,37 +87,37 @@ PYBIND11_PLUGIN(core) {
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
-      .def("get_dims",
+      .def("_get_dims",
            [](const Tensor &self) { return vectorize(self.dims()); })
-      .def("set_dims",
+      .def("_set_dims",
            [](Tensor &self, const std::vector<int64_t> &dim) {
              self.Resize(make_ddim(dim));
            })
-      .def("set_layout",
+      .def("_set_layout",
            [](Tensor &self, const std::string &layout) {
              self.set_layout(StringToDataLayout(layout));
            })
-      .def("alloc_float",
+      .def("_alloc_float",
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<float>(place);
            })
-      .def("alloc_float",
+      .def("_alloc_float",
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<float>(place);
            })
-      .def("alloc_int",
+      .def("_alloc_int",
            [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<int>(place);
            })
-      .def("alloc_int",
+      .def("_alloc_int",
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
            })
-      .def("alloc_int",
+      .def("_alloc_int",
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<int>(place);
            })
-      .def("alloc_float",
+      .def("_alloc_float",
            [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
              self.mutable_data<float>(place);
            })
@@ -136,11 +145,11 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("set_float_element", TensorSetElement<float>)
-      .def("get_float_element", TensorGetElement<float>)
-      .def("set_double_element", TensorSetElement<double>)
-      .def("get_double_element", TensorGetElement<double>)
-      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
+      .def("_set_float_element", TensorSetElement<float>)
+      .def("_get_float_element", TensorGetElement<float>)
+      .def("_set_double_element", TensorSetElement<double>)
+      .def("_get_double_element", TensorGetElement<double>)
+      .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
@@ -239,15 +248,11 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("rows", [](SelectedRows &self) {
-#ifndef PADDLE_WITH_CUDA
-        return self.rows();
-#else
-         auto rows = self.rows();
-         std::vector<int64_t> new_rows;
-         new_rows.reserve(rows.size());
-         std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
-         return new_rows;
-#endif
+        auto rows = self.rows();
+        std::vector<int64_t> new_rows;
+        new_rows.reserve(rows.size());
+        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+        return new_rows;
       });
 
   py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
@@ -296,13 +301,14 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("reset", &framework::ReaderHolder::ReInit);
+      .def("reset", &framework::ReaderHolder::ResetAll);
 
   using LoDTensorBlockingQueue =
       ::paddle::operators::reader::LoDTensorBlockingQueue;
   using LoDTensorBlockingQueueHolder =
       ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
-  py::class_<LoDTensorBlockingQueue>(m, "LoDTensorBlockingQueue", "")
+  py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
+      m, "LoDTensorBlockingQueue", "")
       .def("push",
            [](LoDTensorBlockingQueue &self,
               const std::vector<framework::LoDTensor> &lod_tensor_vec) {
@@ -317,7 +323,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_lod_tensor_blocking_queue",
         [](Variable &var, size_t capacity,
            const std::vector<std::vector<int64_t>> &shapes)
-            -> LoDTensorBlockingQueue * {
+            -> std::shared_ptr<LoDTensorBlockingQueue> {
               std::vector<DDim> dims(shapes.size());
               std::transform(shapes.begin(), shapes.end(), dims.begin(),
                              [](const std::vector<int64_t> &shape) {
@@ -325,9 +331,9 @@ All parameter, weight, gradient are variables in Paddle.
                              });
               auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
               holder->InitOnce(capacity, dims);
-              return holder->GetQueue().get();
+              return holder->GetQueue();
             },
-        py::return_value_policy::reference);
+        py::return_value_policy::copy);
 
   py::class_<Scope>(m, "Scope", "")
       .def("var",
@@ -492,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
-#ifdef PADDLE_WITH_DISTRIBUTE
-      .def("begin_pass", &Executor::BeginPass)
-      .def("end_pass", &Executor::EndPass)
-#endif
+      .def("close", &Executor::Close)
       .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
                      int block_id, bool create_local_scope, bool create_vars) {
         pybind11::gil_scoped_release release;
@@ -508,6 +511,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
     // Only GPUs with Compute Capability >= 53 support float16
@@ -534,6 +538,8 @@ All parameter, weight, gradient are variables in Paddle.
       });
 
   py::class_<LoDTensorArray>(m, "LoDTensorArray")
+      .def("__init__",
+           [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); })
       .def("__getitem__",
            [](LoDTensorArray &self, size_t i) { return &self.at(i); },
            py::return_value_policy::reference)
@@ -656,7 +662,7 @@ All parameter, weight, gradient are variables in Paddle.
                   const std::string &, Scope *, std::vector<Scope *> &,
                   const ExecutionStrategy &, const BuildStrategy &, size_t,
                   size_t>())
-      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
+      .def("bcast_params", &ParallelExecutor::BCastParamsToDevices)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
       // of vec<Scope*> will be freed by Python GC. We can only return Scope*
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
index 330d104e0a774d905e463566f85bd2e64a080190..f83b026d4d50772b969c4316964b70a68b27442b 100644
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -30,7 +30,9 @@ class RecordIOWriter {
  public:
   RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
                  size_t max_num_record)
-      : stream_(filename), writer_(&stream_, compressor, max_num_record) {}
+      : closed_(false),
+        stream_(filename),
+        writer_(&stream_, compressor, max_num_record) {}
 
   void AppendTensor(const framework::LoDTensor& tensor) {
     tensors_.push_back(tensor);
@@ -47,9 +49,17 @@ class RecordIOWriter {
     PADDLE_ENFORCE(tensors_.empty());
     writer_.Flush();
     stream_.close();
+    closed_ = true;
+  }
+
+  ~RecordIOWriter() {
+    if (!closed_) {
+      Close();
+    }
   }
 
  private:
+  bool closed_;
   std::vector<framework::LoDTensor> tensors_;
   std::ofstream stream_;
   recordio::Writer writer_;
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index 06a13e6c5b6ea76456e231e3f7b1eb33492b16ea..a0a2f984228db0e7a015630655a3176aa4d1a5a4 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
 
 Scanner::Scanner(const std::string &filename)
     : stream_(new std::ifstream(filename)), parser_(*stream_) {
+  PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
   Reset();
 }
 
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index 4425f062efa6eab552caee1a429746528cd66926..a0757b53f37b29de0b3802c345b1ad9db69f16e9 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <time.h>
 #include <fstream>
 
 #include "paddle/fluid/framework/executor.h"
@@ -21,6 +22,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace train {
@@ -93,11 +95,21 @@ int main() {
 
   auto loss_var = scope.Var(loss_name);
 
+  paddle::platform::ProfilerState pf_state;
+  pf_state = paddle::platform::ProfilerState::kCPU;
+  paddle::platform::EnableProfiler(pf_state);
+  clock_t t1 = clock();
+
   for (int i = 0; i < 10; ++i) {
     executor.Run(*train_program.get(), &scope, 0, false, true);
     std::cout << "step: " << i << " loss: "
               << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
               << std::endl;
   }
+
+  clock_t t2 = clock();
+  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
+                                    "run_paddle_op_profiler");
+  std::cout << "run_time = " << t2 - t1 << std::endl;
   return 0;
 }
diff --git a/paddle/legacy/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp
index 87fac3d6c6abe37b128213d4ffd66f8c1573a910..0ce1770c76c2e145d0b2bf71332cc4593517f195 100644
--- a/paddle/legacy/capi/Arguments.cpp
+++ b/paddle/legacy/capi/Arguments.cpp
@@ -66,6 +66,17 @@ paddle_error paddle_arguments_get_value(paddle_arguments args,
   return kPD_NO_ERROR;
 }
 
+PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
+                                              uint64_t ID,
+                                              paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  m->mat = a->args[ID].in;
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_arguments_get_ids(paddle_arguments args,
                                       uint64_t ID,
                                       paddle_ivector ids) {
diff --git a/paddle/legacy/capi/arguments.h b/paddle/legacy/capi/arguments.h
index 69a66bb012c318bc8317c246d690a7f4baffd248..ceb64ee6aa74a8ba4b5cb9045b366dcda8f8cc90 100644
--- a/paddle/legacy/capi/arguments.h
+++ b/paddle/legacy/capi/arguments.h
@@ -87,6 +87,18 @@ PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
                                                uint64_t ID,
                                                paddle_matrix mat);
 
+/**
+ * @brief paddle_arguments_get_prob Get the prob matrix of beam search, which
+ *        slot ID is `ID`
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] mat matrix pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
+                                              uint64_t ID,
+                                              paddle_matrix mat);
+
 /**
  * @brief PDArgsGetIds Get the integer vector of one argument in array, which
  *        index is `ID`.
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp
index 7faeff55c28b9065179ad27b3b604a9f411249e5..21ed049c4d2743d1fa914d6948d6c8c2862f0bfc 100644
--- a/paddle/legacy/utils/PythonUtil.cpp
+++ b/paddle/legacy/utils/PythonUtil.cpp
@@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName,
                            const std::string& funcName,
                            const std::vector<std::string>& args) {
   PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args);
+#if PY_MAJOR_VERSION >= 3
+  Py_ssize_t str_size = 0u;
+  const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size);
+  return std::string(str, (size_t)str_size);
+#else
   return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
+#endif  // PY_MAJOR_VERSION >= 3
 }
 
 PyObjectPtr createPythonClass(
diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h
index b0c8612c378fbe12cdf24e51a5b6546740b2d4c8..d5b2dbddde21f5c2a0696aadeda2b057175fc5e9 100644
--- a/paddle/legacy/utils/PythonUtil.h
+++ b/paddle/legacy/utils/PythonUtil.h
@@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
 namespace py {
 PyObjectPtr import(const std::string& moduleName);
 
+#if PY_MAJOR_VERSION >= 3
+/**
+ * Cast a PyLong to int type T.
+ * @tparam T return type.
+ * @param [in] obj PyLong object.
+ * @param [out] ok status for casting. False if error occured. nullptr if user
+ *                 don't care is ok or not.
+ * @return The value of python object, or 0 if not ok.
+ */
+template <typename T>
+T castInt(PyObject* obj, bool* ok = nullptr) {
+  // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object
+  // were unified to long since python3
+  if (PyLong_Check(obj)) {
+    if (ok) *ok = true;
+    return (T)PyLong_AsUnsignedLong(obj);
+  } else {
+    if (ok) *ok = false;
+    return (T)0;
+  }
+}
+
+// Convert PyAPI from 2.x to 3.x
+#define PyString_FromString PyUnicode_FromString
+#define PyString_AsString PyUnicode_AsUTF8
+
+#else
 /**
  * Cast a PyLong or PyInt to int type T.
  * @tparam T return type.
@@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) {
     return (T)0;
   }
 }
+#endif  // PY_MAJOR_VERSION >= 3
 
 /**
  * Invoke repr of python object.
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d173b41e86f61954954b6a5ea9957d2e172deca0..8460f93b841fe136db138e0dc7576f3aacdbeb5f 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -19,6 +19,8 @@
 #                   Utils
 #=================================================
 
+set -ex
+
 function print_usage() {
     echo -e "\n${RED}Usage${NONE}:
     ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
@@ -37,6 +39,7 @@ function print_usage() {
     ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
     ${BLUE}check_style${NONE}: run code style check
     ${BLUE}cicheck${NONE}: run CI tasks
+    ${BLUE}assert_api_not_changed${NONE}: check api compability
     "
 }
 
@@ -78,6 +81,12 @@ function cmake_gen() {
             PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
         -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
         -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+        elif [ "$1" == "cp35-cp35m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
+            export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
+            export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
         fi
     fi
 
@@ -108,6 +117,7 @@ function cmake_gen() {
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
         -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
+        -DPY_VERSION=${PY_VERSION:-2.7}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -136,7 +146,8 @@ EOF
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
+        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \
+        -DPY_VERSION=${PY_VERSION:-2.7}
 }
 
 function abort(){
@@ -318,11 +329,22 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \
-        > origin.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
-    python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec
+    python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate
+
+    API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
+    echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
+    if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
+        # TODO: curl -H 'Authorization: token ${TOKEN}'
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have at least 2 approvals for the api change!"
+        exit 1
+        fi
+    fi
 }
 
 
@@ -397,6 +419,25 @@ EOF
     linkchecker doc/v2/en/html/index.html
     linkchecker doc/v2/cn/html/index.html
     linkchecker doc/v2/api/en/html/index.html
+
+    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+
+    # Deploy to the the content server if its a "develop" or "release/version" branch
+    # The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
+    if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
+        PPO_SCRIPT_BRANCH=develop
+    elif [[ "$TRAVIS_BRANCH" == "develop"  ||  "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
+        PPO_SCRIPT_BRANCH=master
+    else
+        # Early exit, this branch doesn't require documentation build
+        return 0;
+    fi
+     # Fetch the paddlepaddle.org deploy_docs.sh from the appopriate branch
+    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
+    export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
+    cd ..
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
+    cd -
 }
 
 function gen_html() {
@@ -508,15 +549,28 @@ function gen_fluid_inference_lib() {
     Deploying fluid inference library ...
     ========================================
 EOF
+        cmake .. -DWITH_DISTRIBUTE=OFF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
-        mv fluid_install_dir fluid
-        tar -cf fluid.tgz fluid
+        cp -r fluid_install_dir fluid
+        tar -czf fluid.tgz fluid
+      fi
+}
+
+function test_fluid_inference_lib() {
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Testing fluid inference library ...
+    ========================================
+EOF
+        cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
+        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+        ./clean.sh
       fi
 }
 
 function main() {
-    set -e
     local CMD=$1
     init
     case $CMD in
@@ -557,6 +611,7 @@ function main() {
       fluid_inference_lib)
         cmake_gen ${PYTHON_ABI:-""}
         gen_fluid_inference_lib
+        test_fluid_inference_lib
         ;;
       check_style)
         check_style
@@ -564,10 +619,11 @@ function main() {
       cicheck)
         cmake_gen ${PYTHON_ABI:-""}
         build
-        assert_api_not_changed
         run_test
         gen_capi_package
         gen_fluid_inference_lib
+        test_fluid_inference_lib
+        assert_api_not_changed
         ;;
       *)
         print_usage
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 3462deb9c2f88b6da643d6aa833449ed5f4a9b34..174c2a12f007b282a5182c0aec9b0a6bec9e55fa 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -52,6 +52,9 @@ EOL
     ${DOCKER_CMD} run -it \
         ${DOCKER_ENV} \
         -e SCRIPT_NAME=$0 \
+        -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \
+        -e TRAVIS_BRANCH=$TRAVIS_BRANCH \
+        -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \
         -v $PADDLE_ROOT:/paddle \
         -v ${HOME}/.ccache:/root/.ccache \
         -w /paddle \
diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e92c60ea2db00cc6e227830228888f9a06735c4
--- /dev/null
+++ b/patches/grpc/completion_queue.h
@@ -0,0 +1,386 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// A completion queue implements a concurrent producer-consumer queue, with
+/// two main API-exposed methods: \a Next and \a AsyncNext. These
+/// methods are the essential component of the gRPC C++ asynchronous API.
+/// There is also a \a Shutdown method to indicate that a given completion queue
+/// will no longer have regular events. This must be called before the
+/// completion queue is destroyed.
+/// All completion queue APIs are thread-safe and may be used concurrently with
+/// any other completion queue API invocation; it is acceptable to have
+/// multiple threads calling \a Next or \a AsyncNext on the same or different
+/// completion queues, or to call these methods concurrently with a \a Shutdown
+/// elsewhere.
+/// \remark{All other API calls on completion queue should be completed before
+/// a completion queue destructor is called.}
+#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+
+#include <typeinfo>
+
+#include <grpc/impl/codegen/atm.h>
+#include <grpcpp/impl/codegen/completion_queue_tag.h>
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+#include <grpcpp/impl/codegen/grpc_library.h>
+#include <grpcpp/impl/codegen/status.h>
+#include <grpcpp/impl/codegen/time.h>
+
+struct grpc_completion_queue;
+
+namespace grpc {
+
+template <class R>
+class ClientReader;
+template <class W>
+class ClientWriter;
+template <class W, class R>
+class ClientReaderWriter;
+template <class R>
+class ServerReader;
+template <class W>
+class ServerWriter;
+namespace internal {
+template <class W, class R>
+class ServerReaderWriterBody;
+}  // namespace internal
+
+class Channel;
+class ChannelInterface;
+class ClientContext;
+class CompletionQueue;
+class Server;
+class ServerBuilder;
+class ServerContext;
+class ServerInterface;
+
+namespace internal {
+class CompletionQueueTag;
+class RpcMethod;
+template <class ServiceType, class RequestType, class ResponseType>
+class RpcMethodHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ClientStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ServerStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class BidiStreamingHandler;
+class UnknownMethodHandler;
+template <class Streamer, bool WriteNeeded>
+class TemplatedBidiStreamingHandler;
+template <class InputMessage, class OutputMessage>
+class BlockingUnaryCallImpl;
+}  // namespace internal
+
+extern CoreCodegenInterface* g_core_codegen_interface;
+
+/// A thin wrapper around \ref grpc_completion_queue (see \ref
+/// src/core/lib/surface/completion_queue.h).
+/// See \ref doc/cpp/perf_notes.md for notes on best practices for high
+/// performance servers.
+class CompletionQueue : private GrpcLibraryCodegen {
+ public:
+  /// Default constructor. Implicitly creates a \a grpc_completion_queue
+  /// instance.
+  CompletionQueue()
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {}
+
+  /// Wrap \a take, taking ownership of the instance.
+  ///
+  /// \param take The completion queue instance to wrap. Ownership is taken.
+  explicit CompletionQueue(grpc_completion_queue* take);
+
+  /// Destructor. Destroys the owned wrapped completion queue / instance.
+  ~CompletionQueue() {
+    if (typeid(*g_core_codegen_interface).hash_code() !=
+        typeid(CoreCodegenInterface).hash_code()) {
+      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
+    }
+  }
+
+  /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
+  enum NextStatus {
+    SHUTDOWN,   ///< The completion queue has been shutdown and fully-drained
+    GOT_EVENT,  ///< Got a new event; \a tag will be filled in with its
+                ///< associated value; \a ok indicating its success.
+    TIMEOUT     ///< deadline was reached.
+  };
+
+  /// Read from the queue, blocking until an event is available or the queue is
+  /// shutting down.
+  ///
+  /// \param tag[out] Updated to point to the read event's tag.
+  /// \param ok[out] true if read a successful event, false otherwise.
+  ///
+  /// Note that each tag sent to the completion queue (through RPC operations
+  /// or alarms) will be delivered out of the completion queue by a call to
+  /// Next (or a related method), regardless of whether the operation succeeded
+  /// or not. Success here means that this operation completed in the normal
+  /// valid manner.
+  ///
+  /// Server-side RPC request: \a ok indicates that the RPC has indeed
+  /// been started. If it is false, the server has been Shutdown
+  /// before this particular call got matched to an incoming RPC.
+  ///
+  /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is
+  /// going to go to the wire. If it is false, it not going to the wire. This
+  /// would happen if the channel is either permanently broken or
+  /// transiently broken but with the fail-fast option. (Note that async unary
+  /// RPCs don't post a CQ tag at this point, nor do client-streaming
+  /// or bidi-streaming RPCs that have the initial metadata corked option set.)
+  ///
+  /// Client-side Write, Client-side WritesDone, Server-side Write,
+  /// Server-side Finish, Server-side SendInitialMetadata (which is
+  /// typically included in Write or Finish when not done explicitly):
+  /// \a ok means that the data/metadata/status/etc is going to go to the
+  /// wire. If it is false, it not going to the wire because the call
+  /// is already dead (i.e., canceled, deadline expired, other side
+  /// dropped the channel, etc).
+  ///
+  /// Client-side Read, Server-side Read, Client-side
+  /// RecvInitialMetadata (which is typically included in Read if not
+  /// done explicitly): \a ok indicates whether there is a valid message
+  /// that got read. If not, you know that there are certainly no more
+  /// messages that can ever be read from this stream. For the client-side
+  /// operations, this only happens because the call is dead. For the
+  /// server-sider operation, though, this could happen because the client
+  /// has done a WritesDone already.
+  ///
+  /// Client-side Finish: \a ok should always be true
+  ///
+  /// Server-side AsyncNotifyWhenDone: \a ok should always be true
+  ///
+  /// Alarm: \a ok is true if it expired, false if it was canceled
+  ///
+  /// \return true if got an event, false if the queue is fully drained and
+  ///         shut down.
+  bool Next(void** tag, bool* ok) {
+    return (AsyncNextInternal(tag,
+                              ok,
+                              g_core_codegen_interface->gpr_inf_future(
+                                  GPR_CLOCK_REALTIME)) != SHUTDOWN);
+  }
+
+  /// Read from the queue, blocking up to \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if a successful event, false otherwise
+  ///        See documentation for CompletionQueue::Next for explanation of ok
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T>
+  NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) {
+    TimePoint<T> deadline_tp(deadline);
+    return AsyncNextInternal(tag, ok, deadline_tp.raw_time());
+  }
+
+  /// EXPERIMENTAL
+  /// First executes \a F, then reads from the queue, blocking up to
+  /// \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param F[in] Function to execute before calling AsyncNext on this queue.
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if read a regular event, false otherwise.
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T, typename F>
+  NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) {
+    CompletionQueueTLSCache cache = CompletionQueueTLSCache(this);
+    f();
+    if (cache.Flush(tag, ok)) {
+      return GOT_EVENT;
+    } else {
+      return AsyncNext(tag, ok, deadline);
+    }
+  }
+
+  /// Request the shutdown of the queue.
+  ///
+  /// \warning This method must be called at some point if this completion queue
+  /// is accessed with Next or AsyncNext. \a Next will not return false
+  /// until this method has been called and all pending tags have been drained.
+  /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .)
+  /// Only once either one of these methods does that (that is, once the queue
+  /// has been \em drained) can an instance of this class be destroyed.
+  /// Also note that applications must ensure that no work is enqueued on this
+  /// completion queue after this method is called.
+  void Shutdown();
+
+  /// Returns a \em raw pointer to the underlying \a grpc_completion_queue
+  /// instance.
+  ///
+  /// \warning Remember that the returned instance is owned. No transfer of
+  /// owership is performed.
+  grpc_completion_queue* cq() { return cq_; }
+
+ protected:
+  /// Private constructor of CompletionQueue only visible to friend classes
+  CompletionQueue(const grpc_completion_queue_attributes& attributes) {
+    cq_ = g_core_codegen_interface->grpc_completion_queue_create(
+        g_core_codegen_interface->grpc_completion_queue_factory_lookup(
+            &attributes),
+        &attributes,
+        NULL);
+    InitialAvalanching();  // reserve this for the future shutdown
+  }
+
+ private:
+  // Friend synchronous wrappers so that they can access Pluck(), which is
+  // a semi-private API geared towards the synchronous implementation.
+  template <class R>
+  friend class ::grpc::ClientReader;
+  template <class W>
+  friend class ::grpc::ClientWriter;
+  template <class W, class R>
+  friend class ::grpc::ClientReaderWriter;
+  template <class R>
+  friend class ::grpc::ServerReader;
+  template <class W>
+  friend class ::grpc::ServerWriter;
+  template <class W, class R>
+  friend class ::grpc::internal::ServerReaderWriterBody;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::RpcMethodHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ClientStreamingHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ServerStreamingHandler;
+  template <class Streamer, bool WriteNeeded>
+  friend class ::grpc::internal::TemplatedBidiStreamingHandler;
+  friend class ::grpc::internal::UnknownMethodHandler;
+  friend class ::grpc::Server;
+  friend class ::grpc::ServerContext;
+  friend class ::grpc::ServerInterface;
+  template <class InputMessage, class OutputMessage>
+  friend class ::grpc::internal::BlockingUnaryCallImpl;
+
+  /// EXPERIMENTAL
+  /// Creates a Thread Local cache to store the first event
+  /// On this completion queue queued from this thread.  Once
+  /// initialized, it must be flushed on the same thread.
+  class CompletionQueueTLSCache {
+   public:
+    CompletionQueueTLSCache(CompletionQueue* cq);
+    ~CompletionQueueTLSCache();
+    bool Flush(void** tag, bool* ok);
+
+   private:
+    CompletionQueue* cq_;
+    bool flushed_;
+  };
+
+  NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline);
+
+  /// Wraps \a grpc_completion_queue_pluck.
+  /// \warning Must not be mixed with calls to \a Next.
+  bool Pluck(internal::CompletionQueueTag* tag) {
+    auto deadline =
+        g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok));
+    GPR_CODEGEN_ASSERT(ignored == tag);
+    // Ignore mutations by FinalizeResult: Pluck returns the C API status
+    return ev.success != 0;
+  }
+
+  /// Performs a single polling pluck on \a tag.
+  /// \warning Must not be mixed with calls to \a Next.
+  ///
+  /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already
+  /// shutdown. This is most likely a bug and if it is a bug, then change this
+  /// implementation to simple call the other TryPluck function with a zero
+  /// timeout. i.e:
+  ///      TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME))
+  void TryPluck(internal::CompletionQueueTag* tag) {
+    auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT) return;
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    // the tag must be swallowed if using TryPluck
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if
+  /// the pluck() was successful and returned the tag.
+  ///
+  /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects
+  /// that the tag is internal not something that is returned to the user.
+  void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) {
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) {
+      return;
+    }
+
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Manage state of avalanching operations : completion queue tags that
+  /// trigger other completion queue operations. The underlying core completion
+  /// queue should not really shutdown until all avalanching operations have
+  /// been finalized. Note that we maintain the requirement that an avalanche
+  /// registration must take place before CQ shutdown (which must be maintained
+  /// elsehwere)
+  void InitialAvalanching() {
+    gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1));
+  }
+  void RegisterAvalanching() {
+    gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_,
+                                 static_cast<gpr_atm>(1));
+  }
+  void CompleteAvalanching();
+
+  grpc_completion_queue* cq_;  // owned
+
+  gpr_atm avalanches_in_flight_;
+};
+
+/// A specific type of completion queue used by the processing of notifications
+/// by servers. Instantiated by \a ServerBuilder.
+class ServerCompletionQueue : public CompletionQueue {
+ public:
+  bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; }
+
+ private:
+  grpc_cq_polling_type polling_type_;
+  friend class ServerBuilder;
+  /// \param is_frequently_polled Informs the GRPC library about whether the
+  /// server completion queue would be actively polled (by calling Next() or
+  /// AsyncNext()). By default all server completion queues are assumed to be
+  /// frequently polled.
+  ServerCompletionQueue(grpc_cq_polling_type polling_type)
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}),
+        polling_type_(polling_type) {}
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47
--- /dev/null
+++ b/patches/grpc/grpc_library.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+
+#include <typeinfo>
+
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+
+namespace grpc {
+
+class GrpcLibraryInterface {
+ public:
+  virtual ~GrpcLibraryInterface() = default;
+  virtual void init() = 0;
+  virtual void shutdown() = 0;
+};
+
+/// Initialized by \a grpc::GrpcLibraryInitializer from
+/// <grpcpp/impl/grpc_library.h>
+extern GrpcLibraryInterface* g_glip;
+
+/// Classes that require gRPC to be initialized should inherit from this class.
+class GrpcLibraryCodegen {
+ public:
+  GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) {
+    if (call_grpc_init) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->init();
+      grpc_init_called_ = true;
+    }
+  }
+  virtual ~GrpcLibraryCodegen() {
+    if (grpc_init_called_ &&
+        typeid(*g_glip).hash_code() !=
+            typeid(GrpcLibraryInterface).hash_code()) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->shutdown();
+    }
+  }
+
+ private:
+  bool grpc_init_called_;
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 797c0fbcc4a2d61f5cbbf691db19b4cba5d38630..25900811509aee8b37fdaf09cf902ea2ae3eee57 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -91,3 +91,16 @@ endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
 )
+
+if(APPLE)
+  find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
+  if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
+    message(FATAL_ERROR "install_name_tool not found, please check.\n")
+  endif()
+else(APPLE)
+  find_program(PATCHELF_EXECUTABLE patchelf)
+  if(NOT PATCHELF_EXECUTABLE)
+    message(FATAL_ERROR "patchelf not found, please install it.\n"
+            "For Ubuntu, the command is: apt-get install -y patchelf.")
+  endif()
+endif(APPLE)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1..241a07a35297e85763781a42696fd727733459a3 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from version import full_version as __version__
-    from version import commit as __git_commit__
+    from paddle.version import full_version as __version__
+    from paddle.version import commit as __git_commit__
 
 except ImportError:
     import sys
-    sys.stderr.write('''Warning with import paddle: you should not 
+    sys.stderr.write('''Warning with import paddle: you should not
      import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
                      )
 
-import reader
-import dataset
-import batch
+import paddle.reader
+import paddle.dataset
+import paddle.batch
 batch = batch.batch
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index 3c6a53db3c2287e8ef5931a06ca5dad455665ee0..008509660739d61245526278735064472b8b06dd 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=True):
+def batch(reader, batch_size, drop_last=False):
     """
     Create a batched reader.
 
@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=True):
         if drop_last == False and len(b) != 0:
             yield b
 
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
     return batch_reader
diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py
index 3315e826e82a33dfeb9c5223ce196cffb1ae7234..54aa3edc51d3734633ce077a59bd86cec8d09032 100644
--- a/python/paddle/dataset/__init__.py
+++ b/python/paddle/dataset/__init__.py
@@ -15,20 +15,20 @@
 Dataset package.
 """
 
-import mnist
-import imikolov
-import imdb
-import cifar
-import movielens
-import conll05
-import uci_housing
-import sentiment
-import wmt14
-import wmt16
-import mq2007
-import flowers
-import voc2012
-import image
+import paddle.dataset.mnist
+import paddle.dataset.imikolov
+import paddle.dataset.imdb
+import paddle.dataset.cifar
+import paddle.dataset.movielens
+import paddle.dataset.conll05
+import paddle.dataset.uci_housing
+import paddle.dataset.sentiment
+import paddle.dataset.wmt14
+import paddle.dataset.wmt16
+import paddle.dataset.mq2007
+import paddle.dataset.flowers
+import paddle.dataset.voc2012
+import paddle.dataset.image
 
 __all__ = [
     'mnist',
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index 07f4dcbdab2fecf84a0a7042a48a8c8a9e5f880d..79ddd8b7e6f31383fa531f398ef37315b92a9807 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 
 
-def reader_creator(filename, sub_name):
+def reader_creator(filename, sub_name, cycle=False):
     def read_batch(batch):
         data = batch['data']
         labels = batch.get('labels', batch.get('fine_labels', None))
@@ -56,10 +56,13 @@ def reader_creator(filename, sub_name):
             names = (each_item.name for each_item in f
                      if sub_name in each_item.name)
 
-            for name in names:
-                batch = cPickle.load(f.extractfile(name))
-                for item in read_batch(batch):
-                    yield item
+            while True:
+                for name in names:
+                    batch = cPickle.load(f.extractfile(name))
+                    for item in read_batch(batch):
+                        yield item
+                if not cycle:
+                    break
 
     return reader
 
@@ -94,34 +97,40 @@ def test100():
         'test')
 
 
-def train10():
+def train10(cycle=False):
     """
     CIFAR-10 training set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Training reader creator
     :rtype: callable
     """
     return reader_creator(
         paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        'data_batch',
+        cycle=cycle)
 
 
-def test10():
+def test10(cycle=False):
     """
     CIFAR-10 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Test reader creator.
     :rtype: callable
     """
     return reader_creator(
         paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        'test_batch',
+        cycle=cycle)
 
 
 def fetch():
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 527044b415533cc640e3cfc5837c08ab0f8b74b1..2354987d20b908a32209f9ac22a2065ee43c3dfd 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -76,7 +76,8 @@ def reader_creator(data_file,
                    dataset_name,
                    mapper,
                    buffered_size=1024,
-                   use_xmap=True):
+                   use_xmap=True,
+                   cycle=False):
     '''
     1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
@@ -96,6 +97,8 @@ def reader_creator(data_file,
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: data reader
     :rtype: callable
     '''
@@ -108,15 +111,18 @@ def reader_creator(data_file,
     file_list = batch_images_from_tar(data_file, dataset_name, img2label)
 
     def reader():
-        for file in open(file_list):
-            file = file.strip()
-            batch = None
-            with open(file, 'r') as f:
-                batch = cPickle.load(f)
-            data = batch['data']
-            labels = batch['label']
-            for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label) - 1
+        while True:
+            for file in open(file_list):
+                file = file.strip()
+                batch = None
+                with open(file, 'r') as f:
+                    batch = cPickle.load(f)
+                data = batch['data']
+                labels = batch['label']
+                for sample, label in itertools.izip(data, batch['label']):
+                    yield sample, int(label) - 1
+            if not cycle:
+                break
 
     if use_xmap:
         cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
@@ -125,7 +131,7 @@ def reader_creator(data_file,
         return map_readers(mapper, reader)
 
 
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers training set reader.
     It returns a reader, each sample in the reader is
@@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: train data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TRAIN_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
@@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: test data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TEST_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
 def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index 9d05aeeb95c4f936cb773ece20407ecb32cbbf21..6259cc35b4f7bb781886bb5da9d16924831d7246 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                 for i in xrange(buffer_size):
                     yield images[i, :], int(labels[i])
         finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass
 
     return reader
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 3034c1a0875a71421bcba172c16ee32d809df152..3b38c42801e0a4b503d929ca422b354f4c51bb0c 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -35,6 +35,7 @@ import io
 import evaluator
 import initializer
 import layers
+import contrib
 import nets
 import optimizer
 import backward
@@ -44,9 +45,9 @@ import metrics
 import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
+from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
 from transpiler import DistributeTranspiler, InferenceTranspiler, \
-    memory_optimize, release_memory
+    memory_optimize, release_memory, DistributeTranspilerConfig
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
 from lod_tensor import create_lod_tensor, create_random_int_lodtensor
@@ -56,35 +57,38 @@ import unique_name
 import recordio_writer
 import parallel_executor
 from parallel_executor import *
+from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 
 Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
-          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-          parallel_executor.__all__ + lod_tensor.__all__ + [
-              'io',
-              'initializer',
-              'layers',
-              'transpiler'
-              'nets',
-              'optimizer',
-              'learning_rate_decay',
-              'backward',
-              'regularizer',
-              'LoDTensor',
-              'CPUPlace',
-              'CUDAPlace',
-              'CUDAPinnedPlace',
-              'Tensor',
-              'ParamAttr',
-              'WeightNormParamAttr',
-              'DataFeeder',
-              'clip',
-              'profiler',
-              'unique_name',
-              'recordio_writer',
-              'Scope',
-          ]
+    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
+    parallel_executor.__all__ + lod_tensor.__all__ + [
+        'io',
+        'initializer',
+        'layers',
+        'contrib',
+        'transpiler',
+        'nets',
+        'optimizer',
+        'learning_rate_decay',
+        'backward',
+        'regularizer',
+        'LoDTensor',
+        'LoDTensorArray',
+        'CPUPlace',
+        'CUDAPlace',
+        'CUDAPinnedPlace',
+        'Tensor',
+        'ParamAttr',
+        'WeightNormParamAttr',
+        'DataFeeder',
+        'clip',
+        'profiler',
+        'unique_name',
+        'recordio_writer',
+        'Scope',
+    ]
 
 
 def __bootstrap__():
@@ -119,8 +123,13 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem'
+        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
+        'cpu_deterministic'
     ]
+    if core.is_compiled_with_dist():
+        read_env_flags.append('rpc_deadline')
+        read_env_flags.append('listen_and_serv_profile_period')
+
     if core.is_compiled_with_cuda():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
@@ -134,5 +143,5 @@ def __bootstrap__():
 
 # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
 # Consider paddle.init(args) or paddle.main(args)
-layers.monkey_patch_variable()
+monkey_patch_variable()
 __bootstrap__()
diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb8756a4664013643c278c013ca21bb237a6b4a7
--- /dev/null
+++ b/python/paddle/fluid/annotations.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import sys
+
+__all__ = ['deprecated']
+
+
+def deprecated(since, instead, extra_message=""):
+    def decorator(func):
+        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
+            func.__name__, since, instead)
+        if len(extra_message) != 0:
+            err_msg += "\n"
+            err_msg += extra_message
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            print >> sys.stderr, err_msg
+            return func(*args, **kwargs)
+
+        wrapper.__doc__ += "\n    "
+        wrapper.__doc__ += err_msg
+        return wrapper
+
+    return decorator
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 4faa06303170488d0de2fda4c1461cfe2d623d35..3fb7019a450da5903952c59ee483d88fde42701a 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -18,10 +18,7 @@ import collections
 import copy
 import unique_name
 
-__all__ = [
-    'append_backward',
-    'calc_gradient',
-]
+__all__ = ['append_backward']
 
 
 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -123,7 +120,8 @@ def _append_grad_suffix_(name):
 def _addup_repetitive_outputs_(op_descs):
     """
     In backward part, an variable may be the output of more than one ops.
-    In this case, the variable should be the accumulation of all the outputs.
+    And one op may yield its multiple outputs to the same variable.
+    In these cases, the variable should be the accumulation of all the outputs.
     `sum_op`s are added to implement the accumulate.
     """
     pending_sum_ops = []
@@ -136,29 +134,46 @@ def _addup_repetitive_outputs_(op_descs):
                     "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
                     {"use_mkldnn": False}), idx))
                 renamed_vars[var_name] = [var_name]
-        for var_name in op_desc.output_arg_names():
-            if var_name == core.empty_var_name(
-            ) or var_name in op_desc.input_arg_names():
-                # empty variable or inplace op
-                continue
-            if len(renamed_vars[var_name]) == 0:
-                # it's the first time we get the variable
-                renamed_vars[var_name] = [var_name]
-            else:
-                if len(renamed_vars[var_name]) == 1:
+        for param_idx, param_name in enumerate(op_desc.output_names()):
+            arg_names = op_desc.output(param_name)
+            for arg_idx, var_name in enumerate(arg_names):
+                if var_name == core.empty_var_name(
+                ) or var_name in op_desc.input_arg_names():
+                    # empty variable or inplace op
+                    continue
+                if len(renamed_vars[var_name]) == 0:
+                    # it's the first time we get the variable
+                    renamed_vars[var_name] = [var_name]
+                else:
+                    if len(renamed_vars[var_name]) == 1:
+                        new_name = var_name + "@RENAME@" + \
+                            str(var_rename_count[var_name])
+                        var_rename_count[var_name] += 1
+                        # rename original var_name
+                        renamed_vars[var_name][0] = new_name
+                        _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                        _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                        for p in op_desc.output_names()[:param_idx]:
+                            p_arg_names = op_desc.output(p)
+                            if var_name in p_arg_names:
+                                op_desc.set_output(p, [
+                                    new_name if x == var_name else x
+                                    for x in p_arg_names
+                                ])
+
+                        arg_names = [
+                            new_name if x == var_name else x
+                            for x in arg_names[:arg_idx]
+                        ] + arg_names[arg_idx:]
+
                     new_name = var_name + "@RENAME@" + \
                         str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
-                    # rename original var_name
-                    renamed_vars[var_name][0] = new_name
-                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
-                    _rename_arg_(pending_sum_ops, var_name, new_name)
-
-                new_name = var_name + "@RENAME@" + \
-                    str(var_rename_count[var_name])
-                var_rename_count[var_name] += 1
-                op_desc.rename_output(var_name, new_name)
-                renamed_vars[var_name].append(new_name)
+                    arg_names[arg_idx] = new_name
+                    op_desc.set_output(param_name, arg_names)
+                    renamed_vars[var_name].append(new_name)
+
     for var_name, inputs in renamed_vars.iteritems():
         if len(inputs) > 1:
             pending_sum_ops.append(
@@ -313,7 +328,7 @@ def _append_backward_ops_(block,
         if op.has_attr("sub_block"):
             sub_block = program.block(op.block_attr("sub_block"))
             grad_sub_block = program.create_block()
-            grad_sub_block.set_forward_block_idx(sub_block.idx)
+            grad_sub_block._set_forward_block_idx(sub_block.idx)
             cb = _callback_lookup_(op)
             if cb is not None:
                 if callbacks is None:
@@ -556,9 +571,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
 
     program.current_block_idx = current_block_idx
-    program.sync_with_cpp()
-    # FIXME(zcd): prevent loss.grad optimized by mem_opt.
-    loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
+    program._sync_with_cpp()
 
     if parameter_list is not None:
         parameters = parameter_list
@@ -729,7 +742,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
 
     _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
-    prog.sync_with_cpp()
+    prog._sync_with_cpp()
 
     grad_vars = []
     for input_var in inputs:
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 18e2f3045e272fb4712391f87bffd3f367c1c744..c029662ebc1b7e7f7d1ea44b4ebd4b08b812a579 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -31,7 +31,7 @@ class BaseErrorClipAttr(object):
     def __str__(self):
         raise NotImplementedError()
 
-    def append_clip_op(self, block, grad_name):
+    def _append_clip_op(self, block, grad_name):
         raise NotImplementedError()
 
 
@@ -67,7 +67,7 @@ class ErrorClipByValue(BaseErrorClipAttr):
     def __str__(self):
         return "ByValue, min=%f, max=%f" % (self.min, self.max)
 
-    def append_clip_op(self, block, grad_name):
+    def _append_clip_op(self, block, grad_name):
         clip_op_desc = block.desc.append_op()
         clip_op_desc.set_type("clip")
         clip_op_desc.set_input("X", [grad_name])
@@ -82,7 +82,7 @@ def error_clip_callback(block, context):
     op_desc = block.desc.op(block.desc.op_size() - 1)
     for grad_n in filter(lambda n: grad_to_var.has_key(n),
                          op_desc.output_arg_names()):
-        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        fwd_var = block._var_recursive(grad_to_var[grad_n])
         error_clip = getattr(fwd_var, "error_clip", None)
         if not (error_clip is None or isinstance(error_clip,
                                                  BaseErrorClipAttr)):
@@ -90,17 +90,17 @@ def error_clip_callback(block, context):
                 "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
             )
         if error_clip is not None:
-            error_clip.append_clip_op(block, grad_n)
+            error_clip._append_clip_op(block, grad_n)
 
 
 class BaseGradientClipAttr(object):
     def __str__(self):
         raise NotImplementedError()
 
-    def process_context(self, context, param, grad):
+    def _process_context(self, context, param, grad):
         raise NotImplementedError()
 
-    def create_operators(self, param, grad):
+    def _create_operators(self, param, grad):
         raise NotImplementedError()
 
 
@@ -108,10 +108,10 @@ class NullGradientClipAttr(BaseGradientClipAttr):
     def __str__(self):
         return "Null"
 
-    def process_context(self, context, param, grad):
+    def _process_context(self, context, param, grad):
         pass
 
-    def create_operators(self, param, grad):
+    def _create_operators(self, param, grad):
         return param, grad
 
 
@@ -153,10 +153,10 @@ class GradientClipByValue(BaseGradientClipAttr):
     def __str__(self):
         return "ByValue, min=%f, max=%f" % (self.min, self.max)
 
-    def process_context(self, context, param, grad):
+    def _process_context(self, context, param, grad):
         pass
 
-    def create_operators(self, param, grad):
+    def _create_operators(self, param, grad):
         new_grad = layers.clip(x=grad, min=self.min, max=self.max)
         return param, new_grad
 
@@ -199,10 +199,10 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __str__(self):
         return "ByNorm, clip_norm=%f" % self.clip_norm
 
-    def process_context(self, context, param, grad):
+    def _process_context(self, context, param, grad):
         pass
 
-    def create_operators(self, param, grad):
+    def _create_operators(self, param, grad):
         new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
         return param, new_grad
 
@@ -257,7 +257,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
                                                               self.clip_norm)
 
-    def process_context(self, context, param, grad):
+    def _process_context(self, context, param, grad):
         if self.group_name not in context:
             context[self.group_name] = []
             context[self.group_name + "_clip_value"] = self.clip_norm
@@ -274,7 +274,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 
         self.context = context
 
-    def create_operators(self, param, grad):
+    def _create_operators(self, param, grad):
         group_scale_name = self.group_name + "_scale"
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
@@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None):
         param.gradient_clip_attr = copy.deepcopy(clip)
 
 
-def append_gradient_clip_ops(param_grad):
+def append_gradient_clip_ops(param_grads):
     context = dict()
-    for p, g in param_grad:
-        with p.block.program.optimized_guard(p):
+    for p, g in param_grads:
+        if g is None:
+            continue
+        with p.block.program.optimized_guard([p, g]):
             clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
             if clip_attr is None:
                 clip_attr = NullGradientClipAttr()
@@ -336,12 +338,14 @@ def append_gradient_clip_ops(param_grad):
                     "clip attribute should be an instance of BaseGradientClipAttr"
                 )
 
-            clip_attr.process_context(context=context, param=p, grad=g)
+            clip_attr._process_context(context=context, param=p, grad=g)
 
     res = []
-    for p, g in param_grad:
-        with p.block.program.optimized_guard(p):
-            res.append(clip_attr.create_operators(param=p, grad=g))
+    for p, g in param_grads:
+        if g is None:
+            continue
+        with p.block.program.optimized_guard([p, g]):
+            res.append(clip_attr._create_operators(param=p, grad=g))
 
     return res
 
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index 470dd0df524936a773f6e740c8079f0efa8ef7b4..b8fe9bd4c1988dd3f6fa82df391c3059dfbfcf93 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -69,8 +69,10 @@ class Go(BlockGuard):
         parent_block.append_op(
             type='go',
             inputs={
-                'X':
-                [parent_block.var_recursive(x_name) for x_name in x_name_list]
+                'X': [
+                    parent_block._var_recursive(x_name)
+                    for x_name in x_name_list
+                ]
             },
             outputs={},
             attrs={'sub_block': go_block})
@@ -259,7 +261,7 @@ class Select(BlockGuard):
             if var_name in intermediate
         ]
 
-        X = [select_block.var_recursive(x_name) for x_name in params]
+        X = [select_block._var_recursive(x_name) for x_name in params]
 
         # Needs to be used by `equal` inside the cases block.
         X.append(self.case_to_execute)
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9de9e9504510baec9aefb47f91793c364450795a
--- /dev/null
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -0,0 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import decoder
+from decoder import *
+import memory_usage_calc
+from memory_usage_calc import *
+
+__all__ = decoder.__all__ + memory_usage_calc.__all__
diff --git a/paddle/contrib/CMakeLists.txt b/python/paddle/fluid/contrib/decoder/__init__.py
similarity index 69%
rename from paddle/contrib/CMakeLists.txt
rename to python/paddle/fluid/contrib/decoder/__init__.py
index 4b19256ef4533a09162edf907f6cd51146517e46..22cfe692690a686f32eba34ee34b9193f0d5ba35 100644
--- a/paddle/contrib/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
@@ -1,16 +1,18 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-# http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-add_subdirectory(inference)
+import beam_search_decoder
+from beam_search_decoder import *
+
+__all__ = beam_search_decoder.__all__
diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba6e13878291ad9f30e92f998767df6d8c6f32c3
--- /dev/null
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -0,0 +1,838 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides a general beam search decoder API for RNN based decoders.
+The purpose of this API is to allow users to highly customize the behavior
+within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.),
+without using the low level API such as while ops.
+
+This API is still under active development and may change drastically.
+"""
+
+import contextlib
+import numpy as np
+
+from ... import layers
+from ...framework import Variable
+from ... import core
+from ... import framework, unique_name
+from ...layer_helper import LayerHelper
+
+__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder']
+
+
+class _DecoderType:
+    TRAINING = 1
+    BEAM_SEARCH = 2
+
+
+class InitState(object):
+    """
+    The initial hidden state object. The state objects holds a variable, and may
+    use it to initialize the hidden state cell of RNN. Usually used as input to
+    `StateCell` class.
+
+    Args:
+        init (Variable): The initial variable of the hidden state. If set None,
+            the variable will be created as a tensor with constant value based
+            on `shape` and `value` param.
+        shape (tuple|list): If `init` is None, new Variable's shape. Default
+            None.
+        value (float): If `init` is None, new Variable's value. Default None.
+        init_boot (Variable): If provided, the initial variable will be created
+            with the same shape as this variable.
+        need_reorder (bool): If set true, the init will be sorted by its lod
+            rank within its batches. This should be used if `batch_size > 1`.
+        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the initial
+            variable.
+
+    Returns:
+        An initialized state object.
+
+    Examples:
+        See `StateCell`.
+    """
+
+    def __init__(self,
+                 init=None,
+                 shape=None,
+                 value=0.0,
+                 init_boot=None,
+                 need_reorder=False,
+                 dtype='float32'):
+        if init is not None:
+            self._init = init
+        elif init_boot is None:
+            raise ValueError(
+                'init_boot must be provided to infer the shape of InitState .\n')
+        else:
+            self._init = layers.fill_constant_batch_size_like(
+                input=init_boot, value=value, shape=shape, dtype=dtype)
+
+        self._shape = shape
+        self._value = value
+        self._need_reorder = need_reorder
+        self._dtype = dtype
+
+    @property
+    def value(self):
+        return self._init
+
+    @property
+    def need_reorder(self):
+        return self._need_reorder
+
+
+class _MemoryState(object):
+    def __init__(self, state_name, rnn_obj, init_state):
+        self._state_name = state_name  # each is a rnn.memory
+        self._rnn_obj = rnn_obj
+        self._state_mem = self._rnn_obj.memory(
+            init=init_state.value, need_reorder=init_state.need_reorder)
+
+    def get_state(self):
+        return self._state_mem
+
+    def update_state(self, state):
+        self._rnn_obj.update_memory(self._state_mem, state)
+
+
+class _ArrayState(object):
+    def __init__(self, state_name, block, init_state):
+        self._state_name = state_name
+        self._block = block
+
+        self._state_array = self._block.create_var(
+            name=unique_name.generate('array_state_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init_state.value.dtype)
+
+        self._counter = self._block.create_var(
+            name=unique_name.generate('array_state_counter'),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            dtype='int64')
+
+        # initialize counter
+        self._block.append_op(
+            type='fill_constant',
+            inputs={},
+            outputs={'Out': [self._counter]},
+            attrs={
+                'shape': [1],
+                'dtype': self._counter.dtype,
+                'value': float(0.0),
+                'force_cpu': True
+            })
+
+        self._counter.stop_gradient = True
+
+        # write initial state
+        block.append_op(
+            type='write_to_array',
+            inputs={'X': init_state.value,
+                    'I': self._counter},
+            outputs={'Out': self._state_array})
+
+    def get_state(self):
+        state = layers.array_read(array=self._state_array, i=self._counter)
+        return state
+
+    def update_state(self, state):
+        layers.increment(x=self._counter, value=1, in_place=True)
+        layers.array_write(state, array=self._state_array, i=self._counter)
+
+
+class StateCell(object):
+    """
+    The state cell class stores the hidden state of the RNN cell. A typical RNN
+    cell has one or more hidden states, and one or more step inputs. This class
+    allows you to defines the name of hidden states as well as step inputs, and
+    their associated variables.
+
+    Args:
+        inputs (dict): A feeding dict of {name(str) : Variable}. It specifies
+            the names of step inputs for RNN cell, and the associated variables.
+            The variable could initially be None and set manually during each
+            RNN step.
+        states (dict): A feeding dict of {name(str) : InitState object}. It
+            specifies the names of hidden states and their initialized state.
+        out_state (str): A string that specifies the name of hidden state that
+            will be used to compute the score in beam search process.
+        name (str): The name of the RNN cell. Default None.
+
+    Raises:
+        `ValueError`: If the initial state is not an instance of InitState, or
+            the out_state is not in the dict of states.
+
+    Returns:
+        StateCell: The initialized StateCell object.
+
+    Examples:
+        .. code-block:: python
+          hidden_state = InitState(init=encoder_out, need_reorder=True)
+          state_cell = StateCell(
+              inputs={'current_word': None},
+              states={'h': hidden_state},
+              out_state='h')
+    """
+
+    def __init__(self, inputs, states, out_state, name=None):
+        self._helper = LayerHelper('state_cell', name=name)
+        self._cur_states = {}
+        self._state_names = []
+        for state_name, state in states.items():
+            if not isinstance(state, InitState):
+                raise ValueError('state must be an InitState object.')
+            self._cur_states[state_name] = state
+            self._state_names.append(state_name)
+        self._inputs = inputs  # inputs is place holder here
+        self._cur_decoder_obj = None
+        self._in_decoder = False
+        self._states_holder = {}
+        self._switched_decoder = False
+        self._state_updater = None
+        self._out_state = out_state
+        if self._out_state not in self._cur_states:
+            raise ValueError('out_state must be one state in states')
+
+    def _enter_decoder(self, decoder_obj):
+        if self._in_decoder == True or self._cur_decoder_obj is not None:
+            raise ValueError('StateCell has already entered a decoder.')
+        self._in_decoder = True
+        self._cur_decoder_obj = decoder_obj
+        self._switched_decoder = False
+
+    def _leave_decoder(self, decoder_obj):
+        if not self._in_decoder:
+            raise ValueError('StateCell not in decoder, '
+                             'invalid leaving operation.')
+
+        if self._cur_decoder_obj != decoder_obj:
+            raise ValueError('Inconsistent decoder object in StateCell.')
+
+        self._in_decoder = False
+        self._cur_decoder_obj = None
+        self._switched_decoder = False
+
+    def _switch_decoder(self):  # lazy switch
+        if not self._in_decoder:
+            raise ValueError('StateCell must be enter a decoder.')
+
+        if self._switched_decoder:
+            raise ValueError('StateCell already done switching.')
+
+        for state_name in self._state_names:
+            if state_name not in self._states_holder:
+                state = self._cur_states[state_name]
+
+                if not isinstance(state, InitState):
+                    raise ValueError('Current type of state is %s, should be '
+                                     'an InitState object.' % type(state))
+
+                self._states_holder[state_name] = {}
+
+                if self._cur_decoder_obj.type == _DecoderType.TRAINING:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = _MemoryState(state_name,
+                                       self._cur_decoder_obj.dynamic_rnn,
+                                       state)
+                elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = _ArrayState(state_name,
+                                      self._cur_decoder_obj._parent_block(),
+                                      state)
+                else:
+                    raise ValueError('Unknown decoder type, only support '
+                                     '[TRAINING, BEAM_SEARCH]')
+
+            # Read back, since current state should be LoDTensor
+            self._cur_states[state_name] = \
+                self._states_holder[state_name][
+                    id(self._cur_decoder_obj)].get_state()
+
+        self._switched_decoder = True
+
+    def get_state(self, state_name):
+        """
+        The getter of state object. Find the state variable by its name.
+
+        Args:
+            state_name (str): A string of the state's name.
+
+        Returns:
+            The associated state object.
+        """
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        if state_name not in self._cur_states:
+            raise ValueError(
+                'Unknown state %s. Please make sure _switch_decoder() '
+                'invoked.' % state_name)
+
+        return self._cur_states[state_name]
+
+    def get_input(self, input_name):
+        """
+        The getter of input variable. Find the input variable by its name.
+
+        Args:
+            input_name (str): The string of the input's name.
+
+        Returns:
+            The associated input variable.
+        """
+        if input_name not in self._inputs or self._inputs[input_name] is None:
+            raise ValueError('Invalid input %s.' % input_name)
+        return self._inputs[input_name]
+
+    def set_state(self, state_name, state_value):
+        """
+        The setter of the state variable. Change the variable of the given
+        `state_name`.
+
+        Args:
+            state_name (str): The name of the state to change.
+            state_value (Var): The variable of the new state.
+        """
+        self._cur_states[state_name] = state_value
+
+    def state_updater(self, updater):
+        """
+        Set up the updater to update the hidden state every RNN step. The
+        behavior of updater could be customized by users. The updater should be
+        a function that takes a `StateCell` object as input and update the
+        hidden state within it. The hidden state could be accessed through
+        `get_state` method.
+
+        Args:
+            updater (func): the updater to update the state cell.
+        """
+        self._state_updater = updater
+
+        def _decorator(state_cell):
+            if state_cell == self:
+                raise TypeError('Updater should only accept a StateCell object '
+                                'as argument.')
+            updater(state_cell)
+
+        return _decorator
+
+    def compute_state(self, inputs):
+        """
+        Provide the step input of RNN cell, and compute the new hidden state
+        with updater and give step input.
+
+        Args:
+            inputs (dict): A feed dict, {name(str): Variable}. name should be
+            the names of step inputs for this RNN cell, and Variable should be
+            the associated variables.
+
+        Examples:
+        .. code-block:: python
+          state_cell.compute_state(inputs={'x': current_word})
+        """
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        for input_name, input_value in inputs.items():
+            if input_name not in self._inputs:
+                raise ValueError('Unknown input %s. '
+                                 'Please make sure %s in input '
+                                 'place holder.' % (input_name, input_name))
+            self._inputs[input_name] = input_value
+        self._state_updater(self)
+
+    def update_states(self):
+        """
+        Update and record state information after each RNN step.
+        """
+        if self._in_decoder and not self._switched_decoder:
+            self._switched_decoder()
+
+        for state_name, decoder_state in self._states_holder.items():
+            if id(self._cur_decoder_obj) not in decoder_state:
+                raise ValueError('Unknown decoder object, please make sure '
+                                 'switch_decoder been invoked.')
+            decoder_state[id(self._cur_decoder_obj)].update_state(
+                self._cur_states[state_name])
+
+    def out_state(self):
+        """
+        Get the output state variable. This must be called after update_states.
+
+        Returns:
+            The output variable of the RNN cell.
+        """
+        return self._cur_states[self._out_state]
+
+
+class TrainingDecoder(object):
+    """
+    A decoder that can only be used for training. The decoder could be
+    initialized with a `StateCell` object. The computation within the RNN cell
+    could be defined with decoder's block.
+
+    Args:
+        state_cell (StateCell): A StateCell object that handles the input and
+            state variables.
+        name (str): The name of this decoder. Default None.
+
+    Returns:
+        TrainingDecoder: The initialized TrainingDecoder object.
+
+    Examples:
+        .. code-block:: python
+          decoder = TrainingDecoder(state_cell)
+          with decoder.block():
+              current_word = decoder.step_input(trg_embedding)
+              decoder.state_cell.compute_state(inputs={'x': current_word})
+              current_score = layers.fc(input=decoder.state_cell.get_state('h'),
+                                        size=32,
+                                        act='softmax')
+              decoder.state_cell.update_states()
+              decoder.output(current_score)
+    """
+    BEFORE_DECODER = 0
+    IN_DECODER = 1
+    AFTER_DECODER = 2
+
+    def __init__(self, state_cell, name=None):
+        self._helper = LayerHelper('training_decoder', name=name)
+        self._status = TrainingDecoder.BEFORE_DECODER
+        self._dynamic_rnn = layers.DynamicRNN()
+        self._type = _DecoderType.TRAINING
+        self._state_cell = state_cell
+        self._state_cell._enter_decoder(self)
+
+    @contextlib.contextmanager
+    def block(self):
+        """
+        Define the behavior of the decoder for each RNN time step.
+        """
+        if self._status != TrainingDecoder.BEFORE_DECODER:
+            raise ValueError('decoder.block() can only be invoked once')
+        self._status = TrainingDecoder.IN_DECODER
+        with self._dynamic_rnn.block():
+            yield
+        self._status = TrainingDecoder.AFTER_DECODER
+        self._state_cell._leave_decoder(self)
+
+    @property
+    def state_cell(self):
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    @property
+    def dynamic_rnn(self):
+        return self._dynamic_rnn
+
+    @property
+    def type(self):
+        return self._type
+
+    def step_input(self, x):
+        """
+        Set the input variable as a step input to the RNN cell. For example,
+        in machine translation, each time step we read one word from the target
+        sentences, then the target sentence is a step input to the RNN cell.
+
+        Args:
+            x (Variable): the variable to be used as step input.
+
+        Returns:
+            Variable: The variable as input of current step.
+
+        Examples:
+        .. code-block:: python
+          current_word = decoder.step_input(trg_embedding)
+        """
+        self._assert_in_decoder_block('step_input')
+        return self._dynamic_rnn.step_input(x)
+
+    def static_input(self, x):
+        """
+        Set the input variable as a static input of RNN cell. In contrast to
+        step input, this variable will be used as a whole within the RNN decode
+        loop and will not be scattered into time steps.
+
+        Args:
+            x (Variable): the variable to be used as static input.
+
+        Returns:
+            Variable: The variable as input of current step.
+
+        Examples:
+        .. code-block:: python
+          encoder_vec = decoder.static_input(encoded_vector)
+        """
+        self._assert_in_decoder_block('static_input')
+        return self._dynamic_rnn.static_input(x)
+
+    def __call__(self, *args, **kwargs):
+        """
+        Get the output of RNN. This API should only be invoked after RNN.block()
+
+        Returns:
+            Variable: The specified output of the RNN cell.
+        """
+        if self._status != TrainingDecoder.AFTER_DECODER:
+            raise ValueError('Output of training decoder can only be visited '
+                             'outside the block.')
+        return self._dynamic_rnn(*args, **kwargs)
+
+    def output(self, *outputs):
+        """
+        Set the output variable of the RNN cell.
+
+        Args:
+            *outputs (Variables): a series of variables that treated as output
+                of the RNN cell.
+
+        Examples:
+        .. code-block:: python
+          out = fluid.layers.fc(input=h,
+                                size=32,
+                                bias_attr=True,
+                                act='softmax')
+          decoder.output(out)
+        """
+        self._assert_in_decoder_block('output')
+        self._dynamic_rnn.output(*outputs)
+
+    def _assert_in_decoder_block(self, method):
+        if self._status != TrainingDecoder.IN_DECODER:
+            raise ValueError('%s should be invoked inside block of '
+                             'TrainingDecoder object.' % method)
+
+
+class BeamSearchDecoder(object):
+    """
+    A beam search decoder that can be used for inference. The decoder should be
+    initialized with a `StateCell` object. The decode process can be defined
+    within its block.
+
+    Args:
+        state_cell (StateCell): A StateCell object that handles the input and
+            state variables.
+        init_ids (Variable): The init beam search token ids.
+        init_scores (Variable): The associated score of each id.
+        target_dict_dim (int): Size of dictionary.
+        word_dim (int): Word embedding dimension.
+        input_var_dict (dict): A feeding dict to feed the required input
+            variables to the state cell. It will be used by state_cell 's
+            compute method. Default empty.
+        topk_size (int): The topk size used for beam search. Default 50.
+        max_len (int): The maximum allowed length of the generated sentence.
+            Default 100.
+        beam_size (int): The beam width of beam search decode. Default 1.
+        end_id (int): The id of end token within beam search.
+        name (str): The name of this decoder. Default None.
+
+    Returns:
+        BeamSearchDecoder: A initialized BeamSearchDecoder object.
+
+    Examples:
+    .. code-block:: python
+      decoder = BeamSearchDecoder(
+          state_cell=state_cell,
+          init_ids=init_ids,
+          init_scores=init_scores,
+          target_dict_dim=target_dict_dim,
+          word_dim=word_dim,
+          init_var_dict={},
+          topk_size=topk_size,
+          sparse_emb=IS_SPARSE,
+          max_len=max_length,
+          beam_size=beam_size,
+          end_id=1,
+          name=None
+      )
+      decoder.decode()
+      translation_ids, translation_scores = decoder()
+    """
+    BEFORE_BEAM_SEARCH_DECODER = 0
+    IN_BEAM_SEARCH_DECODER = 1
+    AFTER_BEAM_SEARCH_DECODER = 2
+
+    def __init__(self,
+                 state_cell,
+                 init_ids,
+                 init_scores,
+                 target_dict_dim,
+                 word_dim,
+                 input_var_dict={},
+                 topk_size=50,
+                 sparse_emb=True,
+                 max_len=100,
+                 beam_size=1,
+                 end_id=1,
+                 name=None):
+        self._helper = LayerHelper('beam_search_decoder', name=name)
+        self._counter = layers.zeros(shape=[1], dtype='int64')
+        self._counter.stop_gradient = True
+        self._type = _DecoderType.BEAM_SEARCH
+        self._max_len = layers.fill_constant(
+            shape=[1], dtype='int64', value=max_len)
+        self._cond = layers.less_than(
+            x=self._counter,
+            y=layers.fill_constant(
+                shape=[1], dtype='int64', value=max_len))
+        self._while_op = layers.While(self._cond)
+        self._state_cell = state_cell
+        self._state_cell._enter_decoder(self)
+        self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
+        self._zero_idx = layers.fill_constant(
+            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self._array_dict = {}
+        self._array_link = []
+        self._ids_array = None
+        self._scores_array = None
+        self._beam_size = beam_size
+        self._end_id = end_id
+
+        self._init_ids = init_ids
+        self._init_scores = init_scores
+        self._target_dict_dim = target_dict_dim
+        self._topk_size = topk_size
+        self._sparse_emb = sparse_emb
+        self._word_dim = word_dim
+        self._input_var_dict = input_var_dict
+
+    @contextlib.contextmanager
+    def block(self):
+        """
+        Define the behavior of the decoder for each RNN time step.
+        """
+        if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER:
+            raise ValueError('block() can only be invoke once.')
+
+        self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER
+
+        with self._while_op.block():
+            yield
+            with layers.Switch() as switch:
+                with switch.case(self._cond):
+                    layers.increment(x=self._counter, value=1.0, in_place=True)
+
+                    for value, array in self._array_link:
+                        layers.array_write(
+                            x=value, i=self._counter, array=array)
+
+                    layers.less_than(
+                        x=self._counter, y=self._max_len, cond=self._cond)
+
+        self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
+        self._state_cell._leave_decoder(self)
+
+    @property
+    def type(self):
+        return self._type
+
+    def early_stop(self):
+        """
+        Stop the generation process in advance. Could be used as "break".
+        """
+        layers.fill_constant(
+            shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond)
+
+    def decode(self):
+        """
+        Set up the computation within the decoder. Then you could call the
+        decoder to get the result of beam search decode. If you want to define
+        a more specific decoder, you could override this function.
+
+        Examples:
+        .. code-block:: python
+          decoder.decode()
+          translation_ids, translation_scores = decoder()
+        """
+        with self.block():
+            prev_ids = self.read_array(init=self._init_ids, is_ids=True)
+            prev_scores = self.read_array(
+                init=self._init_scores, is_scores=True)
+            prev_ids_embedding = layers.embedding(
+                input=prev_ids,
+                size=[self._target_dict_dim, self._word_dim],
+                dtype='float32',
+                is_sparse=self._sparse_emb)
+
+            feed_dict = {}
+            update_dict = {}
+
+            for init_var_name, init_var in self._input_var_dict.items():
+                if init_var_name not in self.state_cell._inputs:
+                    raise ValueError('Variable ' + init_var_name +
+                                     ' not found in StateCell!\n')
+
+                read_var = self.read_array(init=init_var)
+                update_dict[init_var_name] = read_var
+                feed_var_expanded = layers.sequence_expand(read_var,
+                                                           prev_scores)
+                feed_dict[init_var_name] = feed_var_expanded
+
+            for state_str in self._state_cell._state_names:
+                prev_state = self.state_cell.get_state(state_str)
+                prev_state_expanded = layers.sequence_expand(prev_state,
+                                                             prev_scores)
+                self.state_cell.set_state(state_str, prev_state_expanded)
+
+            for i, input_name in enumerate(self._state_cell._inputs):
+                if input_name not in feed_dict:
+                    feed_dict[input_name] = prev_ids_embedding
+
+            self.state_cell.compute_state(inputs=feed_dict)
+            current_state = self.state_cell.out_state()
+            current_state_with_lod = layers.lod_reset(
+                x=current_state, y=prev_scores)
+            scores = layers.fc(input=current_state_with_lod,
+                               size=self._target_dict_dim,
+                               act='softmax')
+            topk_scores, topk_indices = layers.topk(scores, k=self._topk_size)
+            accu_scores = layers.elementwise_add(
+                x=layers.log(x=topk_scores),
+                y=layers.reshape(
+                    prev_scores, shape=[-1]),
+                axis=0)
+            selected_ids, selected_scores = layers.beam_search(
+                prev_ids,
+                prev_scores,
+                topk_indices,
+                accu_scores,
+                self._beam_size,
+                end_id=1,
+                level=0)
+
+            with layers.Switch() as switch:
+                with switch.case(layers.is_empty(selected_ids)):
+                    self.early_stop()
+                with switch.default():
+                    self.state_cell.update_states()
+                    self.update_array(prev_ids, selected_ids)
+                    self.update_array(prev_scores, selected_scores)
+                    for update_name, var_to_update in update_dict.items():
+                        self.update_array(var_to_update, feed_dict[update_name])
+
+    def read_array(self, init, is_ids=False, is_scores=False):
+        """
+        Read an array to get the decoded ids and scores generated by previous
+        RNN step. At the first step of RNN, the init variable mut be used to
+        initialize the array.
+
+        Args:
+            init (Variable): The initial variable for first step usage. init
+                must be provided.
+            is_ids (bool): Specify whether the variable is an id.
+            is_scores (bool): Specify whether the variable is a score.
+
+        Returns:
+            The associated variable generated during previous RNN steps.
+
+        Examples:
+            .. code-block:: python
+              prev_ids = decoder.read_array(init=init_ids, is_ids=True)
+              prev_scores = decoder.read_array(init=init_scores, is_scores=True)
+        """
+        self._assert_in_decoder_block('read_array')
+
+        if is_ids and is_scores:
+            raise ValueError('Shouldn\'t mark current array be ids array and'
+                             'scores array at the same time.')
+
+        if not isinstance(init, Variable):
+            raise TypeError('The input argument `init` must be a Variable.')
+
+        parent_block = self._parent_block()
+        array = parent_block.create_var(
+            name=unique_name.generate('beam_search_decoder_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init.dtype)
+        parent_block.append_op(
+            type='write_to_array',
+            inputs={'X': init,
+                    'I': self._zero_idx},
+            outputs={'Out': array})
+
+        if is_ids:
+            self._ids_array = array
+        elif is_scores:
+            self._scores_array = array
+
+        read_value = layers.array_read(array=array, i=self._counter)
+        self._array_dict[read_value.name] = array
+        return read_value
+
+    def update_array(self, array, value):
+        """
+        Store the value generated in current step in an array for each RNN step.
+        This array could be accessed by read_array method.
+
+        Args:
+            array (Variable): The array to append the new variable to.
+            value (Variable): The newly generated value to be stored.
+        """
+        self._assert_in_decoder_block('update_array')
+
+        if not isinstance(array, Variable):
+            raise TypeError(
+                'The input argument `array` of  must be a Variable.')
+        if not isinstance(value, Variable):
+            raise TypeError('The input argument `value` of must be a Variable.')
+
+        array = self._array_dict.get(array.name, None)
+        if array is None:
+            raise ValueError('Please invoke read_array before update_array.')
+        self._array_link.append((value, array))
+
+    def __call__(self):
+        """
+        Run the decode process and return the final decode result.
+
+        Returns:
+            A tuple of decoded (id, score) pairs. id is a Variable that holds
+            the generated tokens, and score is a Variable with the same shape
+            as id, holds the score for each generated token.
+        """
+        if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
+            raise ValueError('Output of BeamSearchDecoder object can '
+                             'only be visited outside the block.')
+        return layers.beam_search_decode(
+            ids=self._ids_array,
+            scores=self._scores_array,
+            beam_size=self._beam_size,
+            end_id=self._end_id)
+
+    @property
+    def state_cell(self):
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    def _parent_block(self):
+        """
+        Getter of parent block.
+
+        Returns:
+            The parent block of decoder.
+        """
+        program = self._helper.main_program
+        parent_block_idx = program.current_block().parent_idx
+        if parent_block_idx < 0:
+            raise ValueError('Invalid block with index %d.' % parent_block_idx)
+        parent_block = program.block(parent_block_idx)
+        return parent_block
+
+    def _assert_in_decoder_block(self, method):
+        if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER:
+            raise ValueError('%s should be invoked inside block of '
+                             'BeamSearchDecoder object.' % method)
diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5da846edb63c28efd791fdfac4046cfa56c24181
--- /dev/null
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module privides a memory usage calculate function for user.
+The purpose of this API is to allow users to estimate memory usage of
+a program under a special batch size, then user can set appropriate 
+batch size to fully utilize a GPU. 
+
+This API is still under active development and may change drastically.
+"""
+
+from .. import core
+from ..framework import Program, Variable
+
+__all__ = ['memory_usage']
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+DEBUG = False
+
+
+def memory_usage(program, batch_size):
+    """
+    Get the estimate memory usage of program with input batch size.
+
+    Args:
+        program(Program): The current Program.
+        batch_size(int): The current input data batch_size.  
+    
+    Returns:
+        min_total_memory(float): the estimate memory usage lower bound.
+        max_total_memory(float): the estimate memory usage upper bound.
+        unit_str(string): the unit of estimate usage result.
+    
+    Examples:
+        
+        >>> import paddle.fluid as fluid
+        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+                fluid.default_main_program(), batch_size=10)
+        >>> print "memory usage is about %.3f - %.3f %s" % \
+                (lower_usage, upper_usage, unit)
+
+    """
+
+    # Parameters check
+    if not isinstance(program, Program):
+        raise TypeError(
+            "Calculating Memory Usage requires Program as its Parameter."
+            "But you passed in %s" % (type(prgram)))
+    if batch_size <= 0:
+        raise ValueError("The batch size need to be positive.")
+
+    # Get the var_name list of first block and calculate
+    total_memory = 0.0
+    for var in program.global_block().vars.itervalues():
+        data_count = 1
+        for x in var.shape:
+            if x == -1:
+                data_count *= batch_size
+            else:
+                data_count *= x
+        var_memory = data_count * dtype_to_size[var.dtype]
+        if DEBUG:
+            print "%s memory usage: %d" % (var.name, var_memory)
+        total_memory += var_memory
+    if DEBUG:
+        print "total memory usage: %.2f" % (total_memory)
+
+    # Convert appropriate unit
+    unit_str = "B"
+    if total_memory > 1024:
+        total_memory /= 1024
+        unit_str = "KB"
+        if total_memory > 1024:
+            total_memory /= 1024
+            unit_str = "MB"
+
+    # Append extra memory consumption (5% - 10%)
+    min_total_memory = total_memory * 1.05
+    max_total_memory = total_memory * 1.1
+
+    return min_total_memory, max_total_memory, unit_str
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index b436dfe70afdb52299222f8ba3f5bdff2842d103..4178971398c953236bf8de4d5cb6e93d0e33380c 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -247,6 +247,7 @@ class Executor(object):
         p.set_place(place)
         self.executor = core.Executor(p)
         self.program_caches = dict()
+        self._closed = False
 
     def as_lodtensor(self, data):
         """
@@ -309,7 +310,7 @@ class Executor(object):
         if not has_feed_operators(global_block, feed, feed_var_name):
             for i, name in enumerate(feed):
                 out = global_block.var(name)
-                global_block.prepend_op(
+                global_block._prepend_op(
                     type='feed',
                     inputs={'X': [feed_var]},
                     outputs={'Out': [out]},
@@ -348,11 +349,23 @@ class Executor(object):
         ]
         return outs
 
-    def begin_pass(self):
-        self.executor.begin_pass()
+    def close(self):
+        """
+        Close this executor.
 
-    def end_pass(self):
-        self.executor.end_pass()
+        You can no long use this executor after calling this method.
+        For the distributed training, this method would free the resource on PServers related to
+        the current Trainer.
+
+        Example:
+            >>> cpu = core.CPUPlace()
+            >>> exe = Executor(cpu)
+            >>> ...
+            >>> exe.close()
+        """
+        if not self._closed:
+            self.executor.close()
+            self._closed = True
 
     def run(self,
             program=None,
@@ -405,6 +418,10 @@ class Executor(object):
             >>>     feed={'X': x},
             >>>     fetch_list=[loss.name])
         """
+
+        if self._closed:
+            raise RuntimeError("Attempted to use a closed Executor")
+
         if feed is None:
             feed = {}
         if not isinstance(feed, dict):
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ea3117e02bd993b06de39725b2c3296031065e3c..10b318cf547b5bfb0d70aeb9fb25b17793297afb 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -32,8 +32,6 @@ except Exception, e:
 import unique_name
 
 __all__ = [
-    'Block',
-    'Variable',
     'Program',
     'Operator',
     'Parameter',
@@ -303,7 +301,7 @@ class Variable(object):
 
     __repr__ = __str__
 
-    def set_desc(self, input):
+    def _set_desc(self, input):
         """
         Set the variable description.
 
@@ -348,7 +346,7 @@ class Variable(object):
     def type(self):
         return self.desc.type()
 
-    def set_error_clip(self, error_clip):
+    def _set_error_clip(self, error_clip):
         """
         Set the error_clip.
 
@@ -447,7 +445,7 @@ class Operator(object):
 
     Notes:
         The constructor of operator should not be invoked directly. Use
-        Block.append_op or Block.prepend_op instead.
+        Block.append_op or Block._prepend_op instead.
 
     Examples:
         .. code-block:: python
@@ -870,7 +868,7 @@ class Block(object):
     def forward_block_idx(self):
         return self.desc.get_forward_block_idx()
 
-    def set_forward_block_idx(self, idx):
+    def _set_forward_block_idx(self, idx):
         """
         Set the forward block Idx.
 
@@ -880,7 +878,7 @@ class Block(object):
         Returns:
             None
         """
-        self.desc.set_forward_block_idx(idx)
+        self.desc._set_forward_block_idx(idx)
 
     @property
     def idx(self):
@@ -909,7 +907,7 @@ class Block(object):
             raise ValueError("var %s not in this block" % name)
         return v
 
-    def var_recursive(self, name):
+    def _var_recursive(self, name):
         """
         Get a Variable by name from this block recursively.
 
@@ -951,9 +949,9 @@ class Block(object):
         raise ValueError("Var {0} is not found recursively".format(name))
 
     def all_parameters(self):
-        return list(self.iter_parameters())
+        return list(self._iter_parameters())
 
-    def iter_parameters(self):
+    def _iter_parameters(self):
         return (item[1] for item in self.vars.iteritems()
                 if isinstance(item[1], Parameter))
 
@@ -966,7 +964,7 @@ class Block(object):
     def has_var(self, name):
         return name in self.vars
 
-    def rename_var(self, name, new_name):
+    def _rename_var(self, name, new_name):
         """
         Rename variable in vars and ops' inputs and outputs
 
@@ -1000,8 +998,8 @@ class Block(object):
         else:
             raise ValueError("unsupported var type: %s", type(v))
         orig_var_type = v.type
-        self.desc.rename_var(name, new_name)
-        # NOTE: v is destroyed by C++ after calling rename_var.
+        self.desc._rename_var(name, new_name)
+        # NOTE: v is destroyed by C++ after calling _rename_var.
         d = self.desc.find_var(new_name)
         if var_type == "Parameter":
             var = Parameter(
@@ -1024,23 +1022,42 @@ class Block(object):
                 error_clip=error_clip,
                 stop_gradient=stop_gradient)
 
-        # rename the python side, sync_with_cpp will only add
+        # rename the python side, _sync_with_cpp will only add
         # new vars/ops to python side.
         self.vars[new_name] = var
         del self.vars[name]
-        self.sync_with_cpp()
+        self._sync_with_cpp()
         return var
 
-    def remove_var(self, name):
-        self.sync_with_cpp()
-        self.desc.remove_var(name)
+    def _remove_var(self, name):
+        self._sync_with_cpp()
+        self.desc._remove_var(name)
         del self.vars[name]
 
     def create_parameter(self, *args, **kwargs):
         global_block = self.program.global_block()
         param = Parameter(global_block, *args, **kwargs)
         if 'initializer' in kwargs:
-            kwargs['initializer'](param, self)
+
+            def _is_inited_by(block, var):
+                init_ops = []
+                for op in block.ops:
+                    if var.name in op.output_arg_names:
+                        init_ops.append(op)
+                return init_ops
+
+            initializer = kwargs['initializer']
+            init_ops = _is_inited_by(global_block, param)
+            init_ops_len = len(init_ops)
+            if init_ops_len > 1:
+                raise RuntimeError("param " + param.name +
+                                   " is inited by multiple init ops " + str(
+                                       init_ops))
+            elif init_ops_len == 1:
+                #TODO already inited, do nothing, should log a warning
+                pass
+            else:
+                initializer(param, self)
         return param
 
     def append_op(self, *args, **kwargs):
@@ -1055,7 +1072,7 @@ class Block(object):
         self.ops.append(op)
         return op
 
-    def insert_op(self, index, *args, **kwargs):
+    def _insert_op(self, index, *args, **kwargs):
         """
         Insert a Operator according to the giving arguments.
 
@@ -1065,13 +1082,13 @@ class Block(object):
         Returns:
             Operator: the insert Operator.
         """
-        self.sync_with_cpp()
-        op_desc = self.desc.insert_op(index)
+        self._sync_with_cpp()
+        op_desc = self.desc._insert_op(index)
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
         self.ops.insert(index, op)
         return op
 
-    def remove_op(self, index):
+    def _remove_op(self, index):
         """
         Remove the specific position operator.
 
@@ -1081,11 +1098,11 @@ class Block(object):
         Returns:
             None
         """
-        self.sync_with_cpp()
-        self.desc.remove_op(index, index + 1)
+        self._sync_with_cpp()
+        self.desc._remove_op(index, index + 1)
         del self.ops[index]
 
-    def slice_ops(self, start, end):
+    def _slice_ops(self, start, end):
         """
         Return the Operator between start and end.
 
@@ -1098,13 +1115,13 @@ class Block(object):
         """
         return self.ops[start:end]
 
-    def prepend_op(self, *args, **kwargs):
-        op_desc = self.desc.prepend_op()
+    def _prepend_op(self, *args, **kwargs):
+        op_desc = self.desc._prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
         self.ops.insert(0, op)
         return op
 
-    def sync_with_cpp(self):
+    def _sync_with_cpp(self):
         """
         Sync from the desc on the c++ end. This method is used to synchronize
         the c++ desc instance generated by backward.
@@ -1170,7 +1187,7 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
-    def copy_param_info_from(self, other):
+    def _copy_param_info_from(self, other):
         """
         Copy the information of parameters from the other block.
 
@@ -1185,12 +1202,13 @@ class Block(object):
             None
         """
         if not isinstance(other, Block):
-            raise TypeError("copy_param_info_from should be invoked with Block")
-        for p in other.iter_parameters():
+            raise TypeError(
+                "_copy_param_info_from should be invoked with Block")
+        for p in other._iter_parameters():
             assert isinstance(p, Parameter)
             v = self.vars.get(p.name, None)
             if v is None:
-                raise ValueError("copy_param_info_from should be invoked with "
+                raise ValueError("_copy_param_info_from should be invoked with "
                                  "same topology")
             assert isinstance(v, Variable)
             new_p = Parameter(
@@ -1208,7 +1226,7 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
-    def clone_variable(self, var):
+    def _clone_variable(self, var):
         """
         Clone a variable into current block.
 
@@ -1319,7 +1337,7 @@ class Program(object):
         self._op_role_var = [var_name]
 
     @contextlib.contextmanager
-    def optimized_guard(self, var):
+    def optimized_guard(self, param_and_grads):
         """
         A with guard to set :code:`Optimization` :code:`OpRole` and
         :code:`OpRoleVar` automatically.
@@ -1327,17 +1345,20 @@ class Program(object):
         Notes: This is a very low level API. Users should not use it directly.
 
         Args:
-            var(Variable|str): The variable (name) to be optimized.
+            param_and_grads(list): The variables (names) to be optimized.
 
         Examples:
 
             >>> p, g = backward(...)
-            >>> with program.optimized_guard(p):
+            >>> with program.optimized_guard([p,g]):
             >>>     p = p - 0.001 * g
         """
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.Optimize
-        self._op_role_var = [var.name if isinstance(var, Variable) else var]
+        self._op_role_var = [
+            var.name if isinstance(var, Variable) else var
+            for var in param_and_grads
+        ]
         yield
         self._op_role_var = []
         self._current_role = OpRole.Forward
@@ -1481,9 +1502,9 @@ class Program(object):
             p = Program()
             p.desc = core.ProgramDesc(self.desc)
             p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-            p.sync_with_cpp()
+            p._sync_with_cpp()
 
-        p.copy_param_info_from(self)
+        p._copy_param_info_from(self)
         p.copy_data_info_from(self)
         return p
 
@@ -1533,12 +1554,17 @@ class Program(object):
         res = Program()
         res.desc = core.prune(self.desc, targets_idx)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
-        res.sync_with_cpp()
+        res._sync_with_cpp()
         return res
 
     def inference_optimize(self):
         """
-        This method will create a new program and change the :code:`is_test`
+        This method will create a new program and do following adjustments on it:
+        1. Remove all reader variables and their creator ops if exist.
+
+        2. Remove the :code:`read_op` if exists.
+
+        3. change the :code:`is_test` 
         attribute of operators to :code:`True`. All the :code:`Parameter`
         information will be lost.
 
@@ -1552,6 +1578,22 @@ class Program(object):
         # core.inference_optimize being fixed.
         res = Program()
         res.desc = core.ProgramDesc(self.desc)
+
+        # remove all readers and the read_op if exist
+        read_op_idx = 0
+        root_block = res.desc.block(0)
+        while True:
+            if read_op_idx >= root_block.op_size() or root_block.op(
+                    read_op_idx).type() == 'read':
+                break
+            read_op_idx += 1
+        if read_op_idx < root_block.op_size():
+            root_block._remove_op(0, read_op_idx + 1)
+        for var in root_block.all_vars():
+            if var.type() == core.VarDesc.VarType.READER:
+                root_block._remove_var(var.name())
+
+        # change all `is_test` attributes to True
         for i in xrange(res.desc.num_blocks()):
             block = res.desc.block(i)
             for j in xrange(block.op_size()):
@@ -1559,7 +1601,7 @@ class Program(object):
                 if op.has_attr('is_test'):
                     op.set_attr('is_test', True)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
-        res.sync_with_cpp()
+        res._sync_with_cpp()
         return res
 
     @staticmethod
@@ -1579,7 +1621,7 @@ class Program(object):
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
         p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
-        p.sync_with_cpp()
+        p._sync_with_cpp()
         return p
 
     @property
@@ -1659,7 +1701,7 @@ class Program(object):
         """
         self.current_block_idx = self.current_block().parent_idx
 
-    def sync_with_cpp(self):
+    def _sync_with_cpp(self):
         """
         Synchronize Python instance to its binding C++ object instance.
         If the program is modified in C++ space, this method should be invoked.
@@ -1673,9 +1715,9 @@ class Program(object):
         for block_idx in range(len(self.blocks), self.desc.num_blocks()):
             self.blocks.append(Block(self, block_idx))
         for block in self.blocks:
-            block.sync_with_cpp()
+            block._sync_with_cpp()
 
-    def copy_param_info_from(self, other):
+    def _copy_param_info_from(self, other):
         """
         Copy the information of parameters from other program.
 
@@ -1689,13 +1731,13 @@ class Program(object):
             None
         """
         if not isinstance(other, Program):
-            raise TypeError("copy_param_info_from should be invoked with "
+            raise TypeError("_copy_param_info_from should be invoked with "
                             "Program")
 
         if len(self.blocks) != len(other.blocks):
-            raise ValueError("copy_param_info_from should be invoked with two "
+            raise ValueError("_copy_param_info_from should be invoked with two "
                              "program, with represent the same topology")
-        self.global_block().copy_param_info_from(other.global_block())
+        self.global_block()._copy_param_info_from(other.global_block())
 
     def copy_data_info_from(self, other):
         """
@@ -1711,11 +1753,11 @@ class Program(object):
             None
         """
         if not isinstance(other, Program):
-            raise TypeError("copy_param_info_from should be invoked with "
+            raise TypeError("_copy_param_info_from should be invoked with "
                             "Program")
 
         if len(self.blocks) != len(other.blocks):
-            raise ValueError("copy_param_info_from should be invoked with two "
+            raise ValueError("_copy_param_info_from should be invoked with two "
                              "program, with represent the same topology")
         for var in other.global_block().vars.itervalues():
             if var.is_data:
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 373e9c060de1ee27c165ccd2380cd8c38612c4d9..0e640bf280d396504deec1183821da3e8a156530 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -148,7 +148,7 @@ class ConstantInitializer(Initializer):
         assert isinstance(var, framework.Variable)
         assert isinstance(block, framework.Block)
         # Initialization Ops should be prepended and not appended
-        op = block.prepend_op(
+        op = block._prepend_op(
             type="fill_constant",
             outputs={"Out": var},
             attrs={
@@ -202,7 +202,7 @@ class UniformInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
-        op = block.prepend_op(
+        op = block._prepend_op(
             type="uniform_random",
             outputs={"Out": var},
             attrs={
@@ -256,7 +256,7 @@ class NormalInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
-        op = block.prepend_op(
+        op = block._prepend_op(
             type="gaussian_random",
             outputs={"Out": var},
             attrs={
@@ -346,7 +346,7 @@ class XavierInitializer(Initializer):
 
         if self._uniform:
             limit = np.sqrt(6.0 / float(fan_in + fan_out))
-            op = block.prepend_op(
+            op = block._prepend_op(
                 type="uniform_random",
                 outputs={"Out": var},
                 attrs={
@@ -359,7 +359,7 @@ class XavierInitializer(Initializer):
 
         else:
             std = np.sqrt(2.0 / float(fan_in + fan_out))
-            op = block.prepend_op(
+            op = block._prepend_op(
                 type="gaussian_random",
                 outputs={"Out": var},
                 attrs={
@@ -444,7 +444,7 @@ class MSRAInitializer(Initializer):
 
         if self._uniform:
             limit = np.sqrt(6.0 / float(fan_in))
-            op = block.prepend_op(
+            op = block._prepend_op(
                 type="uniform_random",
                 outputs={"Out": var},
                 attrs={
@@ -457,7 +457,7 @@ class MSRAInitializer(Initializer):
 
         else:
             std = np.sqrt(2.0 / float(fan_in))
-            op = block.prepend_op(
+            op = block._prepend_op(
                 type="gaussian_random",
                 outputs={"Out": var},
                 attrs={
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 5c8f4f6507c7dd9b3d005639d962ce1e55b2c704..1ec670de07062057ba09e15ac1e4da026d035a53 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -24,10 +24,7 @@ from . import core
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
     'load_persistables', 'save_inference_model', 'load_inference_model',
-    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
-    'clean_checkpoint', 'load_persist_vars_without_grad',
-    'load_lookup_table_vars', 'save_persist_vars_without_grad',
-    'get_latest_checkpoint_serial'
+    'get_inference_program'
 ]
 
 
@@ -69,7 +66,8 @@ def is_persistable(var):
             res = fluid.io.is_persistable(param)
     """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-            var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.READER:
         return False
     return var.persistable
 
@@ -526,7 +524,7 @@ def prepend_feed_ops(inference_program,
 
     for i, name in enumerate(feed_target_names):
         out = global_block.var(name)
-        global_block.prepend_op(
+        global_block._prepend_op(
             type='feed',
             inputs={'X': [feed_var]},
             outputs={'Out': [out]},
@@ -628,7 +626,7 @@ def save_inference_model(dirname,
     for i, op in enumerate(global_block.ops):
         op.desc.set_is_target(False)
         if op.type == "feed" or op.type == "fetch":
-            global_block.remove_op(i)
+            global_block._remove_op(i)
     copy_program.desc.flush()
 
     pruned_program = copy_program.prune(targets=target_vars)
@@ -792,683 +790,3 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
-
-
-SUCCESS_MARK_FILENAME = "_SUCCESS"
-CHECKPOINT_PREFIX = "checkpoint"
-MODEL_DIR = "__model__"
-LOOKUP_TABLE_DIR = "__lookup_table__"
-TRAINER_PREFIX = "trainer"
-CHECKPOINT_SEPARATOR = "_"
-
-
-def save_checkpoint(executor,
-                    checkpoint_dir,
-                    trainer_id,
-                    trainer_args=None,
-                    main_program=None,
-                    max_num_checkpoints=3,
-                    lookup_table=None,
-                    ps_endpoint_list=None):
-    """
-    This function filters out all checkpoint variables from the give
-    main_program and then saves these variables to the `checkpoint_dir` 
-    directory.
-
-    In the training precess, we generally save a checkpoint in each
-    iteration. So there might be a lot of checkpoints in the 
-    `checkpoint_dir`. To avoid them taking too much disk space, the 
-    `max_num_checkpoints` are introduced to limit the total number of 
-    checkpoints. If the number of existing checkpints is greater than 
-    the `max_num_checkpoints`, oldest ones will be scroll deleted.
-
-    A variable is a checkpoint variable and will be saved if it meets
-    all following conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for save checkpoint.
-        checkpoint_dir(str): The folder where to save checkpoints.
-        trainer_id(int): currect trainer id, if id is equal to 0, the trainer 
-            is chief.
-        trainer_args(dict|None): Current training arguments. Such as 'epoch_id' 
-            and 'step_id'.
-            Defaut: None
-        main_program(Program|None): The program whose checkpoint variables will
-            be saved. If it is None, the default main program will be used.
-        max_num_checkpoints(int): The max number of total number of existing 
-            checkpoints.
-            Default: 3
-        lookup_table(string|None): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name 
-        ps_endpoint_list(list|None): the parameter server ip:port list.  
-            when use distribute lookup table, we can get ps_endpoint_list by 
-            distribute arguments.
-
-    Returns:
-        None
-
-    Raises:
-        ValueError: If `checkpoint_dir` is None.
-        AssertionError: If `trainer_args` is not a dict.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            path = "./checkpoints"
-            prog = fluid.default_main_program()
-            trainer_args = {"epoch_id": 200,
-                            "step_id": 20} # just an example
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            fluid.io.save_checkpoint(executor=exe,
-                                     checkpoint_dir=path,
-                                     trainer_id=0,
-                                     trainer_args=trainer_args,
-                                     main_program=prog,
-                                     max_num_checkpoints=3,
-                                     lookup_table=table_name,
-                                     ps_endpoint_list = ps_endpoints)
-    """
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-    assert checkpoint_dir
-
-    if trainer_args:
-        assert isinstance(trainer_args, dict)
-
-    is_chief = trainer_id == 0
-
-    _make_chekcpoint_dirs(checkpoint_dir)
-    serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
-    cur_dir = _get_serial_dir(checkpoint_dir, serial)
-
-    save_trainer_args(cur_dir, trainer_id, trainer_args)
-
-    if is_chief:
-        save_persist_vars_without_grad(executor, cur_dir, main_program)
-
-    if is_chief and lookup_table and ps_endpoint_list:
-        save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
-                                    ps_endpoint_list)
-
-    _scroll_delete(checkpoint_dir, max_num_checkpoints)
-
-
-def load_checkpoint(executor, checkpoint_dir, serial, main_program):
-    """
-    This function filters out all checkpoint variables from the give
-    main_program and then try to load these variables from the
-    `checkpoint_dir` directory.
-
-    In the training precess, we generally save a checkpoint in each
-    iteration. So there are more than one checkpoint in the 
-    `checkpoint_dir` (each checkpoint has its own sub folder), use 
-    `serial` to specify which serial of checkpoint you would like to
-    load.
-
-    A variable is a checkpoint variable and will be loaded if it meets
-    all following conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for loading checkpoint.
-        checkpoint_dir(str): The folder where all checkpoints are.
-        serial(int): The serial of checkpoint you would like to load.
-        main_program(Program): The program whose checkpoint variables will
-                               be loaded.
-
-    Returns:
-        None
-
-    Raises:
-        ValueError: If `checkpoint_dir` is None.
-        ValueError: If `serial` is None or `serial` is less than 0.
-        ValueError: If `main_program` is None.
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            path = "./checkpoints"
-            prog = fluid.default_main_program()
-            fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
-                    serial=9, main_program=prog)
-
-            # In this example, `load_checkpoint` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then try to load these variables form the
-            # folder "./checkpoints/checkpoint_9/__model__".
-    """
-
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-
-    if serial is None or serial < 0:
-        raise ValueError("'serial' should not be None or <0 ")
-
-    if main_program is None:
-        raise ValueError('main_program should not be None.')
-
-    cur_dir = _get_serial_dir(checkpoint_dir, serial)
-    load_persist_vars_without_grad(executor, cur_dir, main_program, True)
-
-
-def clean_checkpoint(checkpoint_dir, delete_dir=False):
-    """
-    clean the checkpoint dir, when the train exits normally, 
-    the trainer will call clean_checkpoint to delete checkpoint directory saved before.
-    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
-
-    : param checkpoint_dir
-    : param delete_dir
-    """
-
-    if checkpoint_dir is None:
-        raise ValueError("'checkpoint_dir' should not be None")
-    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
-
-    if delete_dir and not os.listdir(checkpoint_dir):
-        os.rmdir(checkpoint_dir)
-
-
-def load_persist_vars_without_grad(executor,
-                                   dirname,
-                                   program,
-                                   has_model_dir=False):
-    """
-    This function filters out all checkpoint variables from the give
-    program and then trys to load these variables from the given directory.
-
-    A variable is a checkpoint variable if it meets all following
-    conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for loading variables.
-        dirname(str): The directory path.
-        program(Program): The program whose checkpoint variables will
-                          be loaded.
-        has_model_dir(bool): if True, the function loads variables
-                             from a sub directory named '__model__'.
-                             Default: False
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            fluid.io.load_persist_vars_without_grad(executor=exe,
-                    dirname=param_path, program=prog, has_model_dir=True)
-
-            # In this example, `load_persist_vars_without_grad` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then trys to load these variables form the
-            # folder "./my_paddle_model/__model__".
-    """
-
-    if has_model_dir:
-        dirname = _get_model_dir(dirname)
-
-    load_vars(
-        executor,
-        dirname=dirname,
-        main_program=program,
-        predicate=_is_checkpoint_var,
-        filename=None)
-
-
-def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
-    """
-    The parameter server will load lookup table's local file in 
-    selectedrows variable.
-
-    Args:
-        executor(Executor): The executor to run for loading persistable variables
-        dirname(str): The directory path
-        main_program(Program): Find the variable named table_name in main_program
-        pserver_id(int): the serial number in pserver_endpoints list
-        table_name(str): lookup table name
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            dirname = "./checkpoints/checkpoint_9/__model__"
-            prog = fluid.default_main_program()
-            pserver_id = 1
-            table_name = "share_w"
-            fluid.io.load_lookup_table_vars(executor=exe,
-                    dirname=dirname, program=prog, pserver_id=pserver_id,
-                    table_name=table_name)
-    """
-
-    for var in program.list_vars():
-        if var.name == table_name:
-            lookup_table_var = var
-            break
-
-    assert lookup_table_var is not None
-
-    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
-    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
-
-    load_prog = Program()
-    load_block = load_prog.global_block()
-
-    load_block.append_op(
-        type='load',
-        inputs={},
-        outputs={'Out': [lookup_table_var]},
-        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
-
-    executor.run(load_prog)
-
-
-def save_persist_vars_without_grad(executor, dirname, program):
-    """
-    This function filters out all checkpoint variables from the give
-    program and then save these variables to a sub-folder '__model__' of 
-    the given directory.
-
-    A variable is a checkpoint variable if it meets all following
-    conditions:
-        1. It's persistable.
-        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
-        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
-
-    Args:
-        executor(Executor): The executor to run for saving variables.
-        dirname(str): The directory path.
-        program(Program): The program whose checkpoint variables will
-                          be saved.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            fluid.io.save_persist_vars_without_grad(executor=exe,
-                    dirname=param_path, program=prog)
-
-            # In this example, `save_persist_vars_without_grad` function
-            # will first filters out all checkpoint variables in the default
-            # main program, and then saves these variables to the folder 
-            # "./my_paddle_model/__model__".
-    """
-    cur_dir = _get_model_dir(dirname)
-    save_vars(
-        executor,
-        dirname=cur_dir,
-        main_program=program,
-        vars=None,
-        predicate=_is_checkpoint_var,
-        filename=None)
-    _write_success(cur_dir)
-
-
-def save_pserver_vars_by_notify(executor, dirname, lookup_table,
-                                ps_endpoint_list):
-    """
-    This function will send checkpoint notify message from Trainer 0
-    to all the pservers.
-    The checkpoint notify message contains lookup table name, 
-    the absolute path on pserver to save lookup_table.
-
-    Args:
-        executor(Executor): The executor to run for send checkpoint notify.
-        dirname(str): The folder where to save checkpoints.
-        lookup_table(string): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name 
-        ps_endpoint_list(list): the parameter server ip:port list.  
-            when use distribute lookup table, we can get ps_endpoint_list by 
-            distribute arguments.
-    Return:
-        None
-    
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = fluid.default_main_program()
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            fluid.io.save_pserver_vars_by_notify(executor=exe,
-                    dirname=param_path, lookup_table=table_name, 
-                    ps_endpoint_list=ps_endpoints)
-    """
-    cur_dir = _get_lookuptable_dir(dirname)
-
-    checkpoint_notify_program = Program()
-    checkpoint_notify_block = checkpoint_notify_program.global_block()
-
-    attrs = {}
-    attrs['epmap'] = ps_endpoint_list
-    attrs['dir'] = cur_dir
-    attrs['lookup_table'] = lookup_table
-
-    checkpoint_notify_block.append_op(
-        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-    executor.run(checkpoint_notify_program)
-
-
-def save_trainer_args(dirname, trainer_id, trainer_args):
-    assert isinstance(trainer_args, dict)
-
-    cur_dir = _get_trainer_dir(dirname, trainer_id)
-
-    for name, value in trainer_args.iteritems():
-        args_file = os.path.join(cur_dir, name)
-        with open(args_file, 'w') as f:
-            f.write(str(value))
-    _write_success(cur_dir)
-
-
-def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
-    """
-    trainer will load some args from it's independent directory, 
-    such as epoch_id and step_id.
-
-    Args:
-        checkpoint_dir(str): The folder where all checkpoints are.
-        serial(int): The serial of checkpoint you would like to load.
-        trainer_id(int): current trainer id.
-        trainer_args(list): list about load trainer args
-    Return:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            param_path = "./checkpoint/"
-            serial = 7
-            trainer_id = 2
-            trainer_args = ["epoch_id", "step_id"]
-
-            fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial,
-            trainer_id=trainer_id, trainer_args=trainer_args)
-    """
-    assert isinstance(trainer_args, list)
-
-    cur_dir = _get_serial_dir(checkpoint_dir, serial)
-    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
-
-    ret_values = []
-
-    for arg in trainer_args:
-        cur_file = os.path.join(cur_dir, arg)
-        with open(cur_file, 'r') as f:
-            contents = f.read()
-            ret_values.append(contents.strip())
-    return ret_values
-
-
-def _is_checkpoint_var(var):
-    """
-    the checkpoint will not save or load all the variables.
-    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
-
-    : param var(Variable)
-    """
-    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-            var.desc.type() == core.VarDesc.VarType.RAW:
-        return False
-    # @GRAD are named for gradient variables, checkpoint will not save it.
-    if "@GRAD" in var.name:
-        return False
-    # .trainer_ are named for distribute train variables, checkpoint will not save it.
-    if ".trainer_" in var.name:
-        return False
-
-    # .block is named for distribute train variables, checkpoint will not save it.
-    if ".block" in var.name:
-        return False
-
-    return var.persistable
-
-
-def _make_chekcpoint_dirs(dirs):
-    """
-    _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it.
-    """
-    assert dirs is not None
-
-    if os.path.isfile(dirs):
-        raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs)
-
-    if not os.path.isdir(dirs):
-        try:
-            os.makedirs(dirs)
-        except OSError as err:
-            if err.errno != errno.EEXIST:
-                raise err
-
-
-def _get_dir_serial(dirname):
-    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
-
-    try:
-        serial_num = int(serial)
-    except ValueError:
-        serial_num = -1
-    return serial_num
-
-
-def _get_serial_dir(dirname, serial):
-    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
-    serial_dir = os.path.join(dirname, serial_folder)
-    _make_chekcpoint_dirs(serial_dir)
-
-    return serial_dir
-
-
-def _get_model_dir(dirname):
-    model_dir = os.path.join(dirname, MODEL_DIR)
-    _make_chekcpoint_dirs(model_dir)
-    return model_dir
-
-
-def _get_lookuptable_dir(dirname):
-    lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
-    _make_chekcpoint_dirs(lookuptable_dir)
-    return lookuptable_dir
-
-
-def _get_trainer_dir(dirname, trainer_id):
-    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
-    trainer_dir = os.path.join(dirname, trainer_folder)
-    _make_chekcpoint_dirs(trainer_dir)
-    return trainer_dir
-
-
-def _scroll_delete(dirname, max_num_checkpoints=3):
-    dirs = os.listdir(dirname)
-    serial_map = {}
-    for serial in dirs:
-        serial_num = _get_dir_serial(serial)
-        serial_map[serial_num] = serial
-
-    if len(serial_map.keys()) <= max_num_checkpoints:
-        return
-
-    serials = serial_map.keys()
-    serials.sort(reverse=True)
-    serials = serials[max_num_checkpoints:]
-    for serial in serials:
-        cur_dir = _get_serial_dir(dirname, serial)
-        try:
-            shutil.rmtree(cur_dir)
-        except OSError as err:
-            if err.errno != errno.ENOENT:
-                raise err
-
-
-def _write_success(dirname):
-    """
-    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
-
-    : param dirname
-    """
-    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
-    with open(success_file, 'a') as f:
-        now = time.ctime()
-        f.write(now)
-
-
-def get_latest_checkpoint_serial(checkpoint_dir):
-    """
-    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
-
-    : param checkpoint_dir
-    """
-    if not checkpoint_dir:
-        return -1
-
-    def has_success(checkpoint_dir, cur_dir):
-        """
-        is _SUCCESS in this dir
-        """
-
-        serial = _get_dir_serial(cur_dir)
-        if serial == -1 or not os.path.isdir(
-                os.path.join(checkpoint_dir, cur_dir)):
-            return -1
-
-        success_path = os.path.join(
-            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
-            SUCCESS_MARK_FILENAME)
-        if os.path.isfile(success_path):
-            return serial
-
-    if not os.path.isdir(checkpoint_dir):
-        return -1
-
-    current_dir = -1
-    dirs = os.listdir(checkpoint_dir)
-    for cur_dir in dirs:
-        success_num = has_success(checkpoint_dir, cur_dir)
-        if success_num > current_dir:
-            current_dir = success_num
-    return current_dir
-
-
-def get_test_program(filelist, program=None, startup_program=None):
-    """
-    Transpile current train program to a program to read test dataset
-    if the program is using reader ops like "open_files_op".
-    """
-
-    def _copy_reader_var_(block, var, new_name=None):
-        if new_name == None:
-            new_name = var.name
-        new_var = block.create_var(
-            name=str(new_name), type=core.VarDesc.VarType.READER)
-        new_var.desc.set_shapes(var.desc.shapes())
-        new_var.desc.set_dtypes(var.desc.dtypes())
-        new_var.persistable = True
-        return new_var
-
-    def _get_test_reader_name(train_reader_name):
-        return train_reader_name + "_test"
-
-    def _is_reader_op(op):
-        block = op.block
-        if "Out" in op.output_names:
-            reader_out = block.vars[op.output("Out")[0]]
-            if reader_out.type == core.VarDesc.VarType.READER:
-                return True
-        return False
-
-    if program == None:
-        program = default_main_program()
-    if startup_program == None:
-        startup_program = default_startup_program()
-    startup_block = startup_program.global_block()
-
-    # 1. find out the orignal reader var name
-    startup_reader_op_list = []
-
-    for op in startup_block.ops:
-        if _is_reader_op(op):
-            startup_reader_op_list.append(op)
-
-    if len(startup_reader_op_list) == 0:
-        return program
-
-    root_reader_op = startup_reader_op_list[0]
-    train_test_reader_map = {}
-    # 2. add operators to startup to read open and read test data files
-    for op in startup_reader_op_list:
-        assert (len(op.output("Out")) == 1)
-        train_reader_name = op.output("Out")[0]
-        train_reader = startup_block.vars[train_reader_name]
-        test_reader = _copy_reader_var_(
-            startup_block,
-            train_reader,
-            new_name=_get_test_reader_name(train_reader_name))
-        train_test_reader_map[train_reader.name] = test_reader
-
-        test_op_inputs = {}
-        for name in op.input_names:
-            train_arg_names = op.input(name)
-            test_arg_vars = []
-            for arg_name in train_arg_names:
-                arg_var = train_test_reader_map[
-                    arg_name] if name == "UnderlyingReader" else startup_block.vars[
-                        arg_name]
-                test_arg_vars.append(arg_var)
-            test_op_inputs[name] = test_arg_vars
-
-        test_op = startup_block.append_op(
-            type=op.type,
-            inputs=test_op_inputs,
-            outputs={'Out': [test_reader]},
-            attrs=op.attrs)
-        # root reader op's filelist attr for read test files
-        if op.type == root_reader_op.type:
-            test_op.set_attr("file_names", filelist)
-        if op.type == "create_multi_pass_reader":
-            test_op.set_attr("pass_num", 1)
-
-    # 3. rename reader vars in inference program to different name
-    #    to avoid read from train data.
-    main_block = program.global_block()
-    for var in main_block.vars.values():
-        if var.type == core.VarDesc.VarType.READER:
-            main_block.rename_var(
-                str(var.name), str(_get_test_reader_name(var.name)))
-
-    for op in main_block.ops:
-        if op.type == root_reader_op.type:
-            test_op.set_attr("file_names", filelist)
-        if op.type == "create_multi_pass_reader":
-            test_op.set_attr("pass_num", 1)
-
-    startup_program.sync_with_cpp()
-    program.sync_with_cpp()
-
-    return program
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 86efd1ff51cf29485ee28b4d60ffb1439af1aad9..de752d1daeb6bc725cf6eff1bb74a786e2ad6b95 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -68,11 +68,11 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
+        return ParamAttr._to_attr(self.kwargs.get('param_attr', None))
 
     @property
     def bias_attr(self):
-        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
+        return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))
 
     def multiple_param_attr(self, length):
         param_attr = self.param_attr
@@ -262,11 +262,11 @@ class LayerHelper(object):
         g_param = self.startup_program.global_block().create_parameter(
             dtype=dtype,
             shape=g_param_shape,
-            **g_param_attr.to_kwargs(with_initializer=False))
+            **g_param_attr._to_kwargs(with_initializer=False))
         v_param = self.startup_program.global_block().create_parameter(
             dtype=dtype,
             shape=v_param_shape,
-            **v_param_attr.to_kwargs(with_initializer=True))
+            **v_param_attr._to_kwargs(with_initializer=True))
         __norm_except_dim(
             x=v_param,
             out=g_param,
@@ -275,9 +275,9 @@ class LayerHelper(object):
 
         # Add weight normalization to main_program
         g_param = self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
+            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
         v_param = self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
+            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
         w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
         return w_param
 
@@ -296,11 +296,11 @@ class LayerHelper(object):
 
         if default_initializer is None and attr.initializer is None:
             if is_bias:
-                attr.set_default_bias_initializer()
+                attr._set_default_bias_initializer()
             else:
-                attr.set_default_param_initializer()
+                attr._set_default_param_initializer()
         else:
-            attr.set_default_initializer(default_initializer)
+            attr._set_default_initializer(default_initializer)
 
         # If weight normalization is set, insert extra parameters and ops.
         # Refer to https://arxiv.org/pdf/1602.07868.pdf
@@ -310,9 +310,9 @@ class LayerHelper(object):
             return param
 
         self.startup_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
+            dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
         return self.main_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr.to_kwargs())
+            dtype=dtype, shape=shape, **attr._to_kwargs())
 
     def get_parameter(self, name):
         param = self.main_program.global_block().var(name)
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250..4917e67de0d20ff9e8f9a27f38e1bd2abef5c503 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -33,7 +33,6 @@ from metric_op import *
 from learning_rate_scheduler import *
 
 __all__ = []
-__all__ += math_op_patch.__all__
 __all__ += nn.__all__
 __all__ += io.__all__
 __all__ += tensor.__all__
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 849474dc58461ac3772f439da7bf5d57592daa8c..3ee1c636ace504e14cf7d6c106df1bc3e864d660 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -21,30 +21,20 @@ from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
 import numpy
+import warnings
 
 __all__ = [
-    'split_lod_tensor',
-    'merge_lod_tensor',
-    'BlockGuard',
-    'BlockGuardWithCompletion',
-    'WhileGuard',
     'While',
     'Switch',
-    'lod_rank_table',
-    'max_sequence_len',
-    'lod_tensor_to_array',
-    'array_to_lod_tensor',
     'increment',
     'array_write',
     'create_array',
     'less_than',
     'equal',
     'array_read',
-    'shrink_memory',
     'array_length',
     'IfElse',
     'DynamicRNN',
-    'ConditionalBlock',
     'StaticRNN',
     'reorder_lod_tensor_by_rank',
     'ParallelDo',
@@ -291,6 +281,9 @@ class ParallelDo(object):
     """
 
     def __init__(self, places, use_nccl=False, name=None):
+        warnings.warn(
+            "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
+            Warning)
         self.helper = LayerHelper("parallel_do", name=name)
         self.inputs = []
         self.places = places
@@ -349,7 +342,7 @@ class ParallelDo(object):
 
         return [parent_block.var(name) for name in params]
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         current_block = main_program.current_block()
         parent_block = self.parent_block()
@@ -405,7 +398,7 @@ class BlockGuardWithCompletion(BlockGuard):
         if exc_type is not None:
             return False
         self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_op()
+        self.rnn._complete_op()
         return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
                                                               exc_tb)
 
@@ -481,7 +474,7 @@ class StaticRNN(object):
             if shape is None or batch_ref is None:
                 raise ValueError(
                     "if init is None, memory at least need shape and batch_ref")
-            parent_block = self.parent_block()
+            parent_block = self._parent_block()
             var_name = unique_name.generate("@".join(
                 [self.helper.name, "memory_boot"]))
             boot_var = parent_block.create_var(
@@ -538,7 +531,7 @@ class StaticRNN(object):
             outputs={'Out': tmp_o},
             attrs={'dtype': o.dtype})
 
-        out_var = self.parent_block().create_var(
+        out_var = self._parent_block().create_var(
             name=tmp_o.name,
             shape=[self.seq_len] + list(tmp_o.shape),
             dtype=tmp_o.dtype)
@@ -554,7 +547,7 @@ class StaticRNN(object):
             raise TypeError("update memory should take variables")
         self.memories[mem.name].mem = var
 
-    def parent_block(self):
+    def _parent_block(self):
         prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
         assert parent_idx >= 0
@@ -571,10 +564,10 @@ class StaticRNN(object):
         else:
             return self.outputs
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         rnn_block = main_program.current_block()
-        parent_block = self.parent_block()
+        parent_block = self._parent_block()
 
         local_inputs = set()
 
@@ -654,7 +647,7 @@ class WhileGuard(BlockGuard):
         if exc_type is not None:
             return False
         self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op.complete()
+        self.while_op._complete()
         return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
 
 
@@ -701,7 +694,7 @@ class While(object):
     def block(self):
         return WhileGuard(self)
 
-    def complete(self):
+    def _complete(self):
         main_program = self.helper.main_program
         while_block = main_program.current_block()
         parent_block = main_program.block(main_program.current_block()
@@ -730,8 +723,10 @@ class While(object):
         parent_block.append_op(
             type='while',
             inputs={
-                'X':
-                [parent_block.var_recursive(x_name) for x_name in x_name_list],
+                'X': [
+                    parent_block._var_recursive(x_name)
+                    for x_name in x_name_list
+                ],
                 'Condition': [self.cond_var]
             },
             outputs={'Out': out_vars,
@@ -1259,7 +1254,7 @@ class ConditionalBlock(object):
         input_set = set([ipt.name for ipt in self.inputs])
 
         param_list = [
-            parent_block.var_recursive(each_name) for each_name in params
+            parent_block._var_recursive(each_name) for each_name in params
             if each_name not in input_set
         ]
 
@@ -1458,7 +1453,7 @@ class IfElse(object):
         if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
             raise ValueError("input must in true/false blocks")
         if id(x) not in self.input_table:
-            parent_block = self.parent_block()
+            parent_block = self._parent_block()
             out_true = parent_block.create_var(
                 name=unique_name.generate('ifelse_input' + self.helper.name),
                 dtype=x.dtype)
@@ -1484,7 +1479,7 @@ class IfElse(object):
         else:
             return out_false
 
-    def parent_block(self):
+    def _parent_block(self):
         current_block = self.helper.main_program.current_block()
         return self.helper.main_program.block(current_block.parent_idx)
 
@@ -1500,7 +1495,7 @@ class IfElse(object):
 
         out_table = self.output_table[1 if self.status ==
                                       self.IN_IF_ELSE_TRUE_BLOCKS else 0]
-        parent_block = self.parent_block()
+        parent_block = self._parent_block()
         for each_out in outs:
             if not isinstance(each_out, Variable):
                 raise TypeError("Each output should be a variable")
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 6af01297df54ffd4201776d20d51a88f5808ccb0..60a12686f8ff43f5ee7e30650a208296963bda3d 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -30,12 +30,14 @@ __all__ = [
     'detection_output',
     'ssd_loss',
     'detection_map',
+    'rpn_target_assign',
     'anchor_generator',
 ]
 
 __auto__ = [
     'iou_similarity',
     'box_coder',
+    'polygon_box_transform',
 ]
 
 __all__ += __auto__
@@ -44,6 +46,135 @@ for _OP in set(__auto__):
     globals()[_OP] = generate_layer_fn(_OP)
 
 
+def rpn_target_assign(loc,
+                      scores,
+                      anchor_box,
+                      gt_box,
+                      rpn_batch_size_per_im=256,
+                      fg_fraction=0.25,
+                      rpn_positive_overlap=0.7,
+                      rpn_negative_overlap=0.3):
+    """
+    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+
+    This layer can be, for given the  Intersection-over-Union (IoU) overlap
+    between anchors and ground truth boxes, to assign classification and
+    regression targets to each each anchor, these target labels are used for
+    train RPN. The classification targets is a binary class label (of being
+    an object or not). Following the paper of Faster-RCNN, the positive labels
+    are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+    overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+    higher than rpn_positive_overlap(0.7) with any ground-truth box. Note
+    that a single ground-truth box may assign positive labels to multiple
+    anchors. A non-positive anchor is when its IoU ratio is lower than
+    rpn_negative_overlap (0.3) for all ground-truth boxes. Anchors that are
+    neither positive nor negative do not contribute to the training objective.
+    The regression targets are the encoded ground-truth boxes associated with
+    the positive anchors.
+
+    Args:
+        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+            predicted locations of M bounding bboxes. N is the batch size,
+            and each bounding box has four coordinate values and the layout
+            is [xmin, ymin, xmax, ymax].
+        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which corresponding M bounding boxes.
+        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            each box is represented as [xmin, ymin, xmax, ymax],
+            [xmin, ymin] is the left top coordinate of the anchor box,
+            if the input is image feature map, they are close to the origin
+            of the coordinate system. [xmax, ymax] is the right bottom
+            coordinate of the anchor box.
+        gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        rpn_batch_size_per_im(int): Total number of RPN examples per image.
+        fg_fraction(float): Target fraction of RoI minibatch that is labeled
+            foreground (i.e. class > 0), 0-th class is background.
+        rpn_positive_overlap(float): Minimum overlap required between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a positive
+            example.
+        rpn_negative_overlap(float): Maximum overlap allowed between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a negative
+            examples.
+
+    Returns:
+        tuple: 
+               A tuple(predicted_scores, predicted_location, target_label,
+               target_bbox) is returned. The predicted_scores and
+               predicted_location is the predicted result of the RPN.
+               The target_label and target_bbox is the ground truth,
+               respectively. The predicted_location is a 2D Tensor with shape
+               [F, 4], and the shape of target_bbox is same as the shape of
+               the predicted_location, F is the number of the foreground
+               anchors. The predicted_scores is a 2D Tensor with shape
+               [F + B, 1], and the shape of target_label is same as the shape
+               of the predicted_scores, B is the number of the background
+               anchors, the F and B is depends on the input of this operator. 
+
+    Examples:
+        .. code-block:: python
+
+        loc = layers.data(name='location', shape=[2, 80],
+                          append_batch_size=False, dtype='float32')
+        scores = layers.data(name='scores', shape=[2, 40],
+                          append_batch_size=False, dtype='float32')
+        anchor_box = layers.data(name='anchor_box', shape=[20, 4],
+                          append_batch_size=False, dtype='float32')
+        gt_box = layers.data(name='gt_box', shape=[10, 4],
+                         append_batch_size=False, dtype='float32')
+        loc_pred, score_pred, loc_target, score_target =
+            fluid.layers.detection_output(loc=location,
+                                          scores=scores,
+                                          anchor_box=anchor_box,
+                                          gt_box=gt_box)
+    """
+
+    helper = LayerHelper('rpn_target_assign', **locals())
+    # 1. Compute the regression target bboxes
+    target_bbox = box_coder(
+        prior_box=anchor_box,
+        target_box=gt_box,
+        code_type='encode_center_size',
+        box_normalized=False)
+
+    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    iou = iou_similarity(x=gt_box, y=anchor_box)
+
+    # 3. Assign target label to anchors
+    loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    helper.append_op(
+        type="rpn_target_assign",
+        inputs={'Overlap': iou, },
+        outputs={
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': target_label,
+        },
+        attrs={
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'fg_fraction': fg_fraction,
+        })
+
+    # 4. Reshape and gather the target entry
+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
+    target_label = nn.reshape(x=target_label, shape=(-1, 1))
+    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+
+    predicted_scores = nn.gather(scores, score_index)
+    predicted_location = nn.gather(loc, loc_index)
+    target_label = nn.gather(target_label, score_index)
+    target_bbox = nn.gather(target_bbox, loc_index)
+    return predicted_scores, predicted_loc, target_label, target_bbox
+
+
 def detection_output(loc,
                      scores,
                      prior_box,
@@ -388,7 +519,6 @@ def target_assign(input,
 
     Returns:
         tuple: 
-        
                A tuple(out, out_weight) is returned. out is a 3D Tensor with 
                shape [N, P, K], N and P is the same as they are in 
                `neg_indices`, K is the same as it in input of X. If 
@@ -660,7 +790,8 @@ def prior_box(input,
               clip=False,
               steps=[0.0, 0.0],
               offset=0.5,
-              name=None):
+              name=None,
+              min_max_aspect_ratios_order=False):
     """
     **Prior Box Operator**
 
@@ -689,6 +820,11 @@ def prior_box(input,
             Default: [0., 0.]
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
+       min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with 
+            Caffe. Please note, this order affects the weights order of
+            convolution layer followed by and does not affect the final
+            detection results. Default: False.
 
     Returns:
         tuple: A tuple with two Variable (boxes, variances)
@@ -742,7 +878,8 @@ def prior_box(input,
         'clip': clip,
         'step_w': steps[0],
         'step_h': steps[1],
-        'offset': offset
+        'offset': offset,
+        'min_max_aspect_ratios_order': min_max_aspect_ratios_order
     }
     if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
         if not _is_list_or_tuple_(max_sizes):
@@ -782,7 +919,8 @@ def multi_box_head(inputs,
                    kernel_size=1,
                    pad=0,
                    stride=1,
-                   name=None):
+                   name=None,
+                   min_max_aspect_ratios_order=False):
     """
     Generate prior boxes for SSD(Single Shot MultiBox Detector)
     algorithm. The details of this algorithm, please refer the
@@ -825,6 +963,11 @@ def multi_box_head(inputs,
        pad(int|list|tuple): The padding of conv2d. Default:0.
        stride(int|list|tuple): The stride of conv2d. Default:1,
        name(str): Name of the prior box layer. Default: None.
+       min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with 
+            Caffe. Please note, this order affects the weights order of
+            convolution layer followed by and does not affect the fininal
+            detection results. Default: False.
 
     Returns:
         tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
@@ -939,7 +1082,8 @@ def multi_box_head(inputs,
         step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
 
         box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                             variance, flip, clip, step, offset)
+                             variance, flip, clip, step, offset, None,
+                             min_max_aspect_ratios_order)
 
         box_results.append(box)
         var_results.append(var)
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
index e0c1aab230aeed7fb858e91e7da7eae58032ee16..384d302a709eeec220864b9e8c9210ed028470f6 100644
--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -18,10 +18,12 @@ All util layers.
 from layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
+from ..annotations import deprecated
 
-__all__ = ['get_places']
+__all__ = []
 
 
+@deprecated(since='0.15.0', instead="ParallelExecutor")
 @autodoc()
 def get_places(device_count=None, device_type=None):
     helper = LayerHelper('get_places', **locals())
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index f33ae76aea95ceeca73c5bae6e4e490cdff29bf3..fab4a92a0ac5ab28508fb52a84aefdba19ac6dde 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -12,19 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+import multiprocessing
+import threading
 
-from .. import core
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
-from ..unique_name import generate as unique_name
+from ..data_feeder import DataFeeder
 from control_flow import BlockGuard
-from ..layer_helper import LayerHelper
+from layer_function_generator import templatedoc
+from .. import core
 from ..executor import global_scope
-from layer_function_generator import generate_layer_fn, templatedoc
+from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
+    default_startup_program, program_guard, Program
+from ..layer_helper import LayerHelper
+from ..unique_name import generate as unique_name
 
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
-    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
-    'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
+    'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+    'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor',
+    'load'
 ]
 
 
@@ -375,9 +379,6 @@ def open_recordio_file(filename,
     if pass_num > 1:
         main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
 
-    if for_parallel:
-        main_prog_var = parallel(reader=main_prog_var)
-
     return monkey_patch_reader_methods(main_prog_var)
 
 
@@ -442,20 +443,267 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                       startup_var)
 
-    if for_parallel:
-        main_prog_var = parallel(reader=main_prog_var)
-
     return monkey_patch_reader_methods(main_prog_var)
 
 
+def py_reader(capacity,
+              shapes,
+              dtypes,
+              lod_levels=None,
+              name=None,
+              use_double_buffer=True):
+    """
+    Create a Python reader for data feeding in Python
+
+    This layer returns a Reader Variable.
+    The Reader provides :code:`decorate_paddle_reader()` and
+    :code:`decorate_tensor_provider()` to set a Python generator as the data
+    source in Python side. When :code:`Executor::Run()` is invoked in C++
+    side, the data from the generator would be read automatically. Unlike
+    :code:`DataFeeder.feed()`, the data reading process and
+    :code:`Executor::Run()` process can run in parallel using
+    :code:`py_reader`. The :code:`start()` method of the Reader should be
+    called when each pass begins, while the :code:`reset()` method should be
+    called when the pass ends and :code:`fluid.core.EOFException` raises.
+    Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
+
+    Args:
+       capacity(int): The buffer capacity maintained by :code:`py_reader`.
+       shapes(list|tuple): List of tuples which declaring data shapes.
+       dtypes(list|tuple): List of strs which declaring data type.
+       lod_levels(list|tuple): List of ints which declaring data lod_level.
+       name(basestring): The prefix Python queue name and Reader name. None will
+            be generated automatically.
+       use_double_buffer(bool): Whether use double buffer or not.
+
+    Returns:
+       Variable: A Reader from which we can get feeding data.
+
+    Examples:
+
+        1. The basic usage of :code:`py_reader` is as follows:
+
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> reader = fluid.layers.py_reader(capacity=64,
+        >>>                                 shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                 dtypes=['float32', 'int64'])
+        >>> reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> img, label = fluid.layers.read_file(reader)
+        >>> loss = network(img, label) # some network definition
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
+        >>>
+        >>> exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+        >>> for epoch_id in range(10):
+        >>>     reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             exe.run(fetch_list=[loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         reader.reset()
+
+        2. When training and testing are both performed, two different
+        :code:`py_reader` should be created with different names, e.g.:
+
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> def network(reader):
+        >>>     img, label = fluid.layers.read_file(reader)
+        >>>     # Here, we omitted the network definition
+        >>>     return loss
+        >>>
+        >>> train_reader = fluid.layers.py_reader(capacity=64,
+        >>>                                       shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                       dtypes=['float32', 'int64'],
+        >>>                                       name='train_reader')
+        >>> train_reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> test_reader = fluid.layers.py_reader(capacity=32,
+        >>>                                      shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                      dtypes=['float32', 'int64'],
+        >>>                                      name='test_reader')
+        >>> test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+        >>>
+        >>> # Create train_main_prog and train_startup_prog
+        >>> train_main_prog = fluid.Program()
+        >>> train_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(train_main_prog, train_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with test program
+        >>>     with fluid.unique_name.guard():
+        >>>         train_loss = network(train_reader) # some network definition
+        >>>         adam = fluid.optimizer.Adam(learning_rate=0.01)
+        >>>         adam.minimize(loss)
+        >>>
+        >>> # Create test_main_prog and test_startup_prog
+        >>> test_main_prog = fluid.Program()
+        >>> test_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(test_main_prog, test_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with train program
+        >>>     with fluid.unique_name.guard():
+        >>>         test_loss = network(test_reader)
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(train_startup_prog)
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(test_startup_prog)
+        >>>
+        >>> train_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                 loss_name=train_loss.name, main_program=train_main_prog)
+        >>> test_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                 loss_name=test_loss.name, main_program=test_main_prog)
+        >>> for epoch_id in range(10):
+        >>>     train_reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             train_exe.run(fetch_list=[train_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         train_reader.reset()
+        >>>
+        >>>     test_reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             test_exe.run(fetch_list=[test_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         test_reader.reset()
+    """
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    if lod_levels is None:
+        lod_levels = [0] * len(shapes)
+
+    if name is None:
+        queue_name = unique_name('lod_tensor_blocking_queue')
+        reader_name = unique_name('create_py_reader')
+        double_buffer_name = unique_name('double_buffer')
+    else:
+        queue_name = "_".join([name, "queue"])
+        reader_name = "_".join([name, "reader"])
+        double_buffer_name = "_".join([name, "double_buffer"])
+
+    var = global_scope().var(queue_name)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=reader_name)
+    startup_blk.append_op(
+        type='create_py_reader',
+        inputs={'blocking_queue': [queue_name]},
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    reader = monkey_patch_reader_methods(main_prog_var)
+    if use_double_buffer:
+        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
+        # we return a double buffer reader. However, the reset method comes from
+        # py_reader.
+        double_buffer_reader.reset = reader.reset
+        reader = double_buffer_reader
+
+    # monkey patch py_reader special methods
+    reader.queue = feed_queue
+    current_reset_method = reader.reset
+    reader.thread = None
+    reader.tensor_provider = None
+    reader.exited = False
+
+    def start_provide_thread(func):
+        def __provider_thread__():
+            for tensors in func():
+                array = core.LoDTensorArray()
+                for item in tensors:
+                    if not isinstance(item, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(item, core.CPUPlace())
+                        item = tmp
+
+                    array.append(item)
+
+                if reader.exited:
+                    break
+                feed_queue.push(array)
+                if reader.exited:
+                    break
+            feed_queue.close()
+
+        reader.thread = threading.Thread(target=__provider_thread__)
+        reader.thread.daemon = True
+        reader.thread.start()
+
+    def __set_tensor_provider__(func):
+        reader.tensor_provider = func
+
+    def __set_paddle_reader__(paddle_reader):
+        with program_guard(Program(), Program()):
+            feed_list = []
+            counter = 0
+            for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
+                name = str(counter)
+                feed_list.append(
+                    data(
+                        name=name,
+                        dtype=dtype,
+                        shape=shape,
+                        lod_level=lod_level))
+                counter += 1
+
+            feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(
+                paddle_reader, multi_devices=False)
+
+        def __tensor_provider__():
+            for slots in paddle_reader():
+                yield [slots[str(idx)] for idx in xrange(counter)]
+
+        __set_tensor_provider__(__tensor_provider__)
+
+    def __reset__():
+        current_reset_method()
+        if reader.thread is not None and reader.tensor_provider is not None:
+            reader.exited = True
+            reader.thread.join()
+            reader.exited = False
+
+    def __start__():
+        start_provide_thread(reader.tensor_provider)
+
+    reader.reset = __reset__
+    reader.decorate_tensor_provider = __set_tensor_provider__
+    reader.decorate_paddle_reader = __set_paddle_reader__
+    reader.start = __start__
+
+    return reader
+
+
 def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num=1,
+               thread_num=None,
                buffer_size=None,
                pass_num=1,
-               for_parallel=True):
+               is_test=None):
     """
     Open files
 
@@ -468,14 +716,14 @@ def open_files(filenames,
        shapes(list): List of tuples which declaring data shapes.
        lod_levels(list): List of ints which declaring data lod_level.
        dtypes(list): List of strs which declaring data type.
-       thread_num(int): The maximal concurrent prefetch thread number.
-       buffer_size(int|None): The size of prefetch buffer. If it is setted None, 
-            buffer size will be thread_num * 3.
-            Default: None
+       thread_num(None): The number of thread to read files.
+            Default: min(len(filenames), cpu_number).
+       buffer_size(None): The buffer size of reader. Default: 3 * thread_num
        pass_num(int): Number of passes to run.
-       for_parallel(Bool): Set it as True if you are going to run 
-            subsequent operators in parallel.
-            Default: True
+       is_test(bool|None): Whether `open_files` used for testing or not. If it
+            is used for testing, the order of data generated is same as the file
+            order. Otherwise, it is not guaranteed the order of data is same
+            between every epoch. [Default: False].
 
     Returns:
        Variable: A Reader Variable via which we can get file data.
@@ -487,15 +735,21 @@ def open_files(filenames,
                                                      './data2.recordio'],
                                              shapes=[(3,224,224), (1)],
                                              lod_levels=[0, 0],
-                                             dtypes=['float32', 'int64'],
-                                             thread_num=2,
-                                             buffer_size=2)
+                                             dtypes=['float32', 'int64'])
 
          # Via the reader, we can use 'read_file' layer to get data:
          image, label = fluid.layers.io.read_file(reader)
     """
+    if thread_num is None:
+        thread_num = min(len(filenames), multiprocessing.cpu_count())
+    else:
+        thread_num = int(thread_num)
+
     if buffer_size is None:
-        buffer_size = thread_num * 3
+        buffer_size = 3 * thread_num
+    else:
+        buffer_size = int(buffer_size)
+
     if isinstance(filenames, basestring):
         filenames = [filenames]
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
@@ -509,17 +763,18 @@ def open_files(filenames,
     multi_file_reader_name = unique_name('multi_file_reader')
     startup_blk = default_startup_program().current_block()
     startup_reader = startup_blk.create_var(name=multi_file_reader_name)
+    attrs = {
+        'shape_concat': shape_concat,
+        'lod_levels': lod_levels,
+        'ranks': ranks,
+        'file_names': filenames,
+        'thread_num': thread_num,
+        'buffer_size': buffer_size
+    }
+    if is_test is not None:
+        attrs['is_test'] = is_test
     startup_blk.append_op(
-        type='open_files',
-        outputs={'Out': [startup_reader]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'ranks': ranks,
-            'file_names': filenames,
-            'thread_num': thread_num,
-            'buffer_size': buffer_size
-        })
+        type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs)
 
     startup_reader.desc.set_dtypes(dtypes)
     startup_reader.persistable = True
@@ -529,9 +784,6 @@ def open_files(filenames,
         main_prog_reader = multi_pass(
             reader=main_prog_reader, pass_num=pass_num)
 
-    if for_parallel:
-        main_prog_reader = parallel(reader=main_prog_reader)
-
     return monkey_patch_reader_methods(main_prog_reader)
 
 
@@ -647,11 +899,6 @@ def multi_pass(reader, pass_num):
         'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
 
 
-def parallel(reader):
-    return __create_shared_decorated_reader__('create_threaded_reader', reader,
-                                              {})
-
-
 def read_file(reader):
     """
     Execute the given reader and get data via it.
@@ -730,7 +977,7 @@ class Preprocessor(object):
         self.sink_var_names = None
         self.status = Preprocessor.BEFORE_SUB_BLOCK
 
-    def is_completed(self):
+    def _is_completed(self):
         return self.sub_block and self.source_var_names and self.sink_var_names
 
     @contextlib.contextmanager
@@ -740,7 +987,7 @@ class Preprocessor(object):
         yield
         self.main_prog.rollback()
         self.status = Preprocessor.AFTER_SUB_BLOCK
-        if not self.is_completed():
+        if not self._is_completed():
             raise RuntimeError(
                 "The definition of preprocessor is incompleted! "
                 "Please make sure that you have set input and output "
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 6071e3e74218e4db4cddc223818d3a9b7086fd86..c7966e36f15ef0e3f30f8a96ad71df04aece0fa1 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -62,10 +62,10 @@ def noam_decay(d_model, warmup_steps):
         The decayed learning rate.
     """
     global_step = _decay_step_counter(1)
-    with init_on_cpu():
-        a = global_step**-0.5
-        b = (warmup_steps**-1.5) * global_step
-        lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
+
+    a = global_step**-0.5
+    b = (warmup_steps**-1.5) * global_step
+    lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
 
     return lr_value
 
@@ -108,12 +108,10 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
     global_step = _decay_step_counter()
 
-    with init_on_cpu():
-        # update learning_rate
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
-        decayed_lr = learning_rate * (decay_rate**div_res)
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = ops.floor(div_res)
+    decayed_lr = learning_rate * (decay_rate**div_res)
 
     return decayed_lr
 
@@ -138,11 +136,10 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
     global_step = _decay_step_counter()
 
-    with init_on_cpu():
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
-        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = ops.floor(div_res)
+    decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
 
     return decayed_lr
 
@@ -184,12 +181,11 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
     global_step = _decay_step_counter()
 
-    with init_on_cpu():
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = ops.floor(div_res)
 
-        decayed_lr = learning_rate / (1 + decay_rate * div_res)
+    decayed_lr = learning_rate / (1 + decay_rate * div_res)
 
     return decayed_lr
 
@@ -224,25 +220,22 @@ def polynomial_decay(learning_rate,
     """
     global_step = _decay_step_counter()
 
-    with init_on_cpu():
-        if cycle:
-            div_res = ops.ceil(global_step / decay_steps)
-            zero_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=0.0)
-            one_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=1.0)
-
-            with control_flow.Switch() as switch:
-                with switch.case(global_step == zero_var):
-                    tensor.assign(input=one_var, output=div_res)
-            decay_steps = decay_steps * div_res
-        else:
-            decay_steps_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=float(decay_steps))
-            global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
+    if cycle:
+        div_res = ops.ceil(global_step / decay_steps)
+        zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0)
+        one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0)
 
-        decayed_lr = (learning_rate - end_learning_rate) * \
-                     ((1 - global_step / decay_steps) ** power) + end_learning_rate
+        with control_flow.Switch() as switch:
+            with switch.case(global_step == zero_var):
+                tensor.assign(input=one_var, output=div_res)
+        decay_steps = decay_steps * div_res
+    else:
+        decay_steps_var = tensor.fill_constant(
+            shape=[1], dtype='float32', value=float(decay_steps))
+        global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
+
+    decayed_lr = (learning_rate - end_learning_rate) * \
+        ((1 - global_step / decay_steps) ** power) + end_learning_rate
     return decayed_lr
 
 
@@ -277,28 +270,28 @@ def piecewise_decay(boundaries, values):
 
     global_step = _decay_step_counter()
 
-    with init_on_cpu():
-        lr = tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="learning_rate")
+    lr = tensor.create_global_var(
+        shape=[1],
+        value=0.0,
+        dtype='float32',
+        persistable=True,
+        name="learning_rate")
 
-        with control_flow.Switch() as switch:
-            for i in range(len(boundaries)):
-                boundary_val = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=float(boundaries[i]))
-                value_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=float(values[i]))
-                with switch.case(global_step < boundary_val):
-                    tensor.assign(value_var, lr)
-            last_value_var = tensor.fill_constant(
+    with control_flow.Switch() as switch:
+        for i in range(len(boundaries)):
+            boundary_val = tensor.fill_constant(
                 shape=[1],
                 dtype='float32',
-                value=float(values[len(values) - 1]))
-            with switch.default():
-                tensor.assign(last_value_var, lr)
+                value=float(boundaries[i]),
+                force_cpu=True)
+            value_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=float(values[i]))
+            with switch.case(global_step < boundary_val):
+                tensor.assign(value_var, lr)
+        last_value_var = tensor.fill_constant(
+            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
+        with switch.default():
+            tensor.assign(last_value_var, lr)
 
     return lr
 
@@ -333,9 +326,9 @@ def append_LARS(params_grads, learning_rate, weight_decay):
         grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
         if type(param_lr) == float and param_lr == 1.0:
             decayed_lr = learning_rate * param_norm \
-                         / _balanced_weight(param_norm, grad_norm)
+                / _balanced_weight(param_norm, grad_norm)
         else:
             decayed_lr = learning_rate * param_lr * param_norm \
-                         / _balanced_weight(param_norm, grad_norm)
+                / _balanced_weight(param_norm, grad_norm)
         # set back param local learning rate
         param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 1754061c4ba6f5b97bced3548bc412dfb1b7932c..f814c41633fbac76eb9411e2f418f521e8e9679d 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -16,8 +16,6 @@ from ..framework import Variable, unique_name
 from layer_function_generator import OpProtoHolder
 from ..initializer import force_init_on_cpu
 
-__all__ = ['monkey_patch_variable']
-
 
 def monkey_patch_variable():
     def unique_tmp_name():
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 99e82fdd04282177fae63f1fb94b5e32d41c612e..e7d7a9e826de95514b6f2e04e7408075ab0b8cb6 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -76,7 +76,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
     return acc_out
 
 
-def auc(input, label, curve='ROC', num_thresholds=200):
+def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
     """
     **Area Under the Curve (AUC) Layer**
 
@@ -102,6 +102,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
         curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
         num_thresholds(int): The number of thresholds to use when discretizing 
                              the roc curve. Default 200.
+        topk(int): only topk number of prediction output will be used for auc.
 
     Returns:
         Variable: A scalar representing the current AUC.
@@ -113,26 +114,35 @@ def auc(input, label, curve='ROC', num_thresholds=200):
             prediction = network(image, is_infer=True)
             auc_out=fluid.layers.auc(input=prediction, label=label)
     """
-
-    warnings.warn(
-        "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
-        but can not aggregate them and get the pass AUC, because pass \
-        auc can not be averaged with weighted from the minibatch auc value. \
-        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
-        which can get every minibatch and every pass auc value.", Warning)
     helper = LayerHelper("auc", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    topk_out, topk_indices = nn.topk(input, k=k)
-    auc_out = helper.create_tmp_variable(dtype="float32")
+    auc_out = helper.create_tmp_variable(dtype="float64")
+    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
+    tp = helper.create_global_variable(persistable=True, dtype='int64')
+    tn = helper.create_global_variable(persistable=True, dtype='int64')
+    fp = helper.create_global_variable(persistable=True, dtype='int64')
+    fn = helper.create_global_variable(persistable=True, dtype='int64')
+    for var in [tp, tn, fp, fn]:
+        helper.set_variable_initializer(
+            var, Constant(
+                value=0.0, force_cpu=True))
+
     helper.append_op(
         type="auc",
         inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
+            "Predict": [input],
+            "Label": [label],
+            "TP": [tp],
+            "TN": [tn],
+            "FP": [fp],
+            "FN": [fn]
         },
         attrs={"curve": curve,
                "num_thresholds": num_thresholds},
-        outputs={"AUC": [auc_out], })
-    return auc_out
+        outputs={
+            "AUC": [auc_out],
+            "TPOut": [tp],
+            "TNOut": [tn],
+            "FPOut": [fp],
+            "FNOut": [fn]
+        })
+    return auc_out, [tp, tn, fp, fn]
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947..94933d1489c356eb6e9efd6d98bd1cba5ddfcd23 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1,4 +1,18 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#   Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -71,6 +85,7 @@ __all__ = [
     'transpose',
     'im2sequence',
     'nce',
+    'hsigmoid',
     'beam_search',
     'row_conv',
     'multiplex',
@@ -95,6 +110,7 @@ __all__ = [
     'relu',
     'log',
     'crop',
+    'rank_loss',
 ]
 
 
@@ -151,7 +167,8 @@ def fc(input,
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
             parameters/weights of this layer.
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to None, no bias will be added to the output units.
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
         act (str, default None): Activation to be applied to the output of this layer.
         is_test(bool): A flag indicating whether execution is in test phase.
         use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
@@ -932,6 +949,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     helper = LayerHelper('dropout', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
     mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+
+    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+        seed = helper.main_program.random_seed
+
     helper.append_op(
         type='dropout',
         inputs={'X': [x]},
@@ -1296,13 +1317,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
 
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     """
-    The input of the softmax layer is a 2-D tensor with shape N x K (N is the
-    batch_size, K is the dimension of input feature). The output tensor has the
-    same shape as the input tensor.
+    The input of the softmax operator is a tensor of any rank. The output tensor 
+    has the same shape as the input.
 
-    For each row of the input tensor, the softmax operator squashes the
-    K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-    values in the range [0, 1] that add up to 1.
+    The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
+    second dimension(row length) is as same as the last dimension of the input 
+    tensor, and the first dimension(column length) is the product of all other 
+    dimensions of the input tensor. For each row of the matrix, the softmax operator 
+    squashes the K-dimensional(K is the width of the matrix, which is also the size 
+    of the input tensor's last dimension) vector of arbitrary real values to a 
+    K-dimensional vector of real values in the range [0, 1] that add up to 1.
 
     It computes the exponential of the given dimension and the sum of exponential
     values of all the other dimensions in the K-dimensional vector input.
@@ -1310,7 +1334,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     exponential values of all the other dimensions is the output of the softmax
     operator.
 
-    For each row :math:`i` and each column :math:`j` in Input(X), we have:
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
 
     .. math::
 
@@ -2945,7 +2969,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x)  # [3.5]
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2954,7 +2978,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
             fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
 
@@ -3857,6 +3881,74 @@ def nce(input,
     return cost / (num_neg_samples + 1)
 
 
+def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
+    """
+    The hierarchical sigmoid operator is used to accelerate the training
+    process of language model. This operator organizes the classes into a 
+    complete binary tree, each leaf node represents a class(a word) and each
+    internal node acts as a binary classifier. For each word there's a unique
+    path from root to it's leaf node, hsigmoid calculate the cost for each
+    internal node on the path, and sum them to get a total cost. hsigmoid can
+    achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+    represents the size of word dict.
+
+    Refer to `Hierarchical Probabilistic Neural Network Language Model
+    <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
+    
+    Args:
+        input (Variable): The input tensor variable with shape 
+            :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
+            and :math:`D` is the feature size.
+        label (Variable): The tensor variable contains labels of training data.
+            It's a tensor with shape is :math:`[N \\times 1]`.
+        num_classes: (int), The number of classes, must not be less than 2.
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter
+             attribute for learnable parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None):  The parameter 
+             attribute for the bias of this layer. If it is set to False, no
+             bias will be applied.
+
+    Returns:
+        Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[2], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='int64')
+            out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
+    """
+
+    helper = LayerHelper('hierarchical_sigmoid', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    pre_out = helper.create_tmp_variable(dtype)
+    dim = input.shape[1]
+    if num_classes < 2:
+        raise ValueError("num_classes must not be less than 2.")
+    weights = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[num_classes - 1, dim],
+        is_bias=False,
+        dtype=input.dtype)
+    inputs = {"X": input, "W": weights, "Label": label}
+    if helper.bias_attr:
+        bias = helper.create_parameter(
+            attr=helper.bias_attr,
+            shape=[1, num_classes - 1],
+            is_bias=True,
+            dtype=input.dtype)
+        inputs['Bias'] = bias
+    helper.append_op(
+        type="hierarchical_sigmoid",
+        inputs=inputs,
+        outputs={"Out": out,
+                 "PreOut": pre_out},
+        attrs={"num_classes": num_classes})
+    return out
+
+
 def transpose(x, perm, name=None):
     """
     Permute the dimensions of `input` according to `perm`.
@@ -3900,7 +3992,13 @@ def transpose(x, perm, name=None):
     return out
 
 
-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                input_image_size=None,
+                out_stride=1,
+                name=None):
     """
     Extracts image patches from the input tensor to form a tensor of shape
     {input.batch_size * output_height * output_width, filter_size_H *
@@ -3937,6 +4035,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
             padding_up = padding_down = padding_left = padding_right = padding
             Default: padding = 0.
 
+        input_image_size(Variable): the input contains image real size.It's dim
+            is [batchsize, 2]. It is dispensable.It is just for batch inference.
+
+        out_stride(int|tuple): The scaling of image through CNN. It is
+            dispensable. It is valid only when input_image_size is not null.
+            If out_stride is tuple,  it must contain two intergers,
+            (out_stride_H, out_stride_W). Otherwise,
+            the out_stride_H = out_stride_W = out_stride.
+
         name (int): The name of this layer. It is optional.
 
     Returns:
@@ -3987,7 +4094,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
                            [ 5.  7.  2.  4.  1.  3.  9.  0.]
                            [ 7.  9.  4.  8.  3.  5.  0.  8.]]
 
-            output.dims = {8, 9}
+            output.dims = {8, 8}
 
             output.lod = [[4, 4]]
 
@@ -4009,18 +4116,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     if len(padding) == 2:
         padding.append(padding[0])
         padding.append(padding[1])
-
+    inputs = {"X": input}
+    attrs = {"kernels": filter_size, "strides": stride, "padding": padding}
+    if input_image_size:
+        if isinstance(out_stride, int):
+            out_stride = [out_stride, out_stride]
+        inputs["Y"] = input_image_size
+        attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
-        type='im2sequence',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'kernels': filter_size,
-            'strides': stride,
-            'paddings': padding,
-        })
+        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
 
 
@@ -4270,7 +4376,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         helper.set_variable_initializer(
             counter, initializer=Constant(
                 value=begin - 1, force_cpu=True))
-        helper.main_program.global_block().prepend_op(
+        helper.main_program.global_block()._prepend_op(
             type='increment',
             inputs={'X': [counter]},
             outputs={'Out': [counter]},
@@ -4374,15 +4480,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")
 
     helper = LayerHelper("reshape", **locals())
-    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="reshape",
         inputs=inputs,
-        attrs={"shape": shape,
-               "inplace": inplace},
-        outputs={"Out": reshaped})
+        attrs={"shape": shape},
+        outputs={"Out": out})
 
-    return helper.append_activation(reshaped)
+    return helper.append_activation(out)
 
 
 def lod_reset(x, y=None, target_lod=None):
@@ -5184,3 +5289,74 @@ def crop(x, shape=None, offsets=None, name=None):
         outputs={'Out': out},
         attrs=None if len(attrs) == 0 else attrs)
     return out
+
+
+def rank_loss(label, left, right, name=None):
+    """
+    **Rank loss layer for RankNet**
+
+    RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)
+    is a pairwise ranking model with a training sample consisting of a pair
+    of documents, A and B. Label P indicates whether A is ranked higher than B
+    or not:
+ 
+    P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information
+    about the rank of the input pair.
+    
+    Rank loss layer takes three inputs: left (o_i), right (o_j) and
+    label (P_{i,j}). The inputs respectively represent RankNet's output scores
+    for documents A and B and the value of label P. The following equation
+    computes rank loss C_{i,j} from the inputs:
+    
+    $$
+      C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
+      o_{i,j} =  o_i - o_j  \\
+      \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
+    $$
+    
+    Rank loss layer takes batch inputs with size batch_size (batch_size >= 1).   
+ 
+    Args:
+        label (Variable): Indicats whether A ranked higher than B or not.
+        left (Variable): RankNet's output score for doc A.
+        right (Variable): RankNet's output score for doc B.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        list: The value of rank loss.
+
+    Raises:
+        ValueError: Any of label, left, and right is not a variable.
+
+    Examples:
+
+        .. code-block:: python
+
+            label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
+            left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
+            right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
+            out = fluid.layers.rank_loss(label, left, right)
+
+
+    """
+    helper = LayerHelper('rank_loss', **locals())
+
+    if not (isinstance(label, Variable)):
+        raise ValueError("The label should be a Variable")
+
+    if not (isinstance(left, Variable)):
+        raise ValueError("The left should be a Variable")
+
+    if not (isinstance(right, Variable)):
+        raise ValueError("The right should be a Variable")
+
+    out = helper.create_tmp_variable("float32")
+
+    helper.append_op(
+        type='rank_loss',
+        inputs={"Label": label,
+                "Left": left,
+                "Right": right},
+        outputs={'Out': out})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 9e97ec9a6f55680a2eb44ad712ac002df4fecda5..01db8645b3aff77371f01c3dec51c85f99065552 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -66,9 +66,7 @@ __all__ = [
     'scatter',
     'sum',
     'slice',
-    'polygon_box_transform',
     'shape',
-    'iou_similarity',
     'maxout',
 ] + __activations__
 
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 17bb0826a6ea86c98a069263dfab84b99e1177ad..b37b09ac81687882443c948569d9c4fca9310f78 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -591,7 +591,7 @@ class Auc(MetricBase):
                       for i in range(self._num_thresholds - 2)]
         thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
-        # caculate TP, FN, TN, FP count
+        # calculate TP, FN, TN, FP count
         for idx_thresh, thresh in enumerate(thresholds):
             tp, fn, tn, fp = 0, 0, 0, 0
             for i, lbl in enumerate(labels):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33..3fe99f55011ab7f745c3ad98ec44dfe277a13e05 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -29,7 +29,7 @@ __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
 ]
 
 
@@ -67,7 +67,7 @@ class Optimizer(object):
         self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
-        lr = self.global_learning_rate()
+        lr = self._global_learning_rate()
 
         if isinstance(lr, framework.Variable):
             return
@@ -86,7 +86,7 @@ class Optimizer(object):
             dtype='float32' if self._dtype == None else self._dtype,
             persistable=True)
 
-    def global_learning_rate(self, program=None):
+    def _global_learning_rate(self, program=None):
         """
         get global decayed learning rate
         :return:
@@ -110,9 +110,9 @@ class Optimizer(object):
             return param_lr
         else:
             if param_lr == 1.0:
-                return self.global_learning_rate()
+                return self._global_learning_rate()
             else:
-                return self.global_learning_rate() * param_lr
+                return self._global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -123,7 +123,7 @@ class Optimizer(object):
         """
         pass
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters_and_grads):
         """Finish any custom updates needed
            before completing an optimization step
 
@@ -132,7 +132,7 @@ class Optimizer(object):
             parameters: list of parameter variables for the optimizer
 
         Returns:
-            list of finish ops or None
+            None
         """
         pass
 
@@ -185,10 +185,10 @@ class Optimizer(object):
                             format(name, param.name))
         return self._accumulators[name][param.name]
 
-    def create_optimization_pass(self,
-                                 parameters_and_grads,
-                                 loss,
-                                 startup_program=None):
+    def _create_optimization_pass(self,
+                                  parameters_and_grads,
+                                  loss,
+                                  startup_program=None):
         """Add optimization operators to update gradients to variables.
 
         Args:
@@ -221,25 +221,26 @@ class Optimizer(object):
             self._create_global_learning_rate()
             if self._LARS_weight_decay > 0.0:
                 layers.append_LARS(parameters_and_grads,
-                                   self.global_learning_rate(),
+                                   self._global_learning_rate(),
                                    self._LARS_weight_decay)
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
                 with param_and_grad[0].block.program.optimized_guard(
-                        param_and_grad[0]):
-                    if param_and_grad[0].trainable is True and param_and_grad[
-                            1] is not None:
+                        param_and_grad):
+                    if param_and_grad[0].trainable is True:
                         optimize_op = self._append_optimize_op(loss.block,
                                                                param_and_grad)
                         optimize_ops.append(optimize_op)
 
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
-            self._finish_update(loss.block)
+            self._finish_update(loss.block, parameters_and_grads)
 
             end = len(global_block.ops)
-            return global_block.slice_ops(start, end)
+            return global_block._slice_ops(start, end)
 
     def minimize(self,
                  loss,
@@ -262,8 +263,8 @@ class Optimizer(object):
         params_grads = append_regularization_ops(params_grads,
                                                  self.regularization)
 
-        optimize_ops = self.create_optimization_pass(params_grads, loss,
-                                                     startup_program)
+        optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                      startup_program)
         return optimize_ops, params_grads
 
 
@@ -323,7 +324,7 @@ class MomentumOptimizer(Optimizer):
 
         & if (use\_nesterov):
 
-        &\quad   param = param - gradient * learning\_rate + mu * velocity * learning\_rate
+        &\quad   param = param - (gradient + mu * velocity) * learning\_rate
 
         & else:
 
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer):
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))
-
         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer):
                                         param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -566,24 +564,30 @@ class AdamOptimizer(Optimizer):
 
         return adam_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, param_and_grads):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param, grad in param_and_grads:
+            if grad is None:
+                continue
+            with param.block.program.optimized_guard([param, grad]):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
+
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta2_pow_acc},
+                    outputs={"Out": beta2_pow_acc},
+                    attrs={"scale": self._beta2})
 
 
 class AdamaxOptimizer(Optimizer):
@@ -626,6 +630,7 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -645,21 +650,16 @@ class AdamaxOptimizer(Optimizer):
         self._epsilon = epsilon
 
     def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -667,6 +667,8 @@ class AdamaxOptimizer(Optimizer):
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -676,7 +678,7 @@ class AdamaxOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -691,18 +693,22 @@ class AdamaxOptimizer(Optimizer):
 
         return adamax_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters_and_grads):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param, grad in parameters_and_grads:
+            if grad is None:
+                continue
+            with param.block.program.optimized_guard([param, grad]):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
 
 
 class DecayedAdagradOptimizer(Optimizer):
@@ -1156,7 +1162,10 @@ class ModelAverage(Optimizer):
                 self.params_grads.append((param, grad))
 
         for param, grad in self.params_grads:
-            self._append_average_accumulate_op(param)
+            if grad is None:
+                continue
+            with param.block.program.optimized_guard([param, grad]):
+                self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
@@ -1171,16 +1180,16 @@ class ModelAverage(Optimizer):
                 self._add_average_restore_op(block, param_grad)
 
     def _add_average_apply_op(self, block, param_grad):
-        param = block.clone_variable(param_grad[0])
-        grad = block.clone_variable(param_grad[1])
-        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
-        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
-        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
-        num_accumulates = block.clone_variable(
+        param = block._clone_variable(param_grad[0])
+        grad = block._clone_variable(param_grad[1])
+        sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block._clone_variable(
             self._get_accumulator('num_accumulates', param))
-        old_num_accumulates = block.clone_variable(
+        old_num_accumulates = block._clone_variable(
             self._get_accumulator('old_num_accumulates', param))
-        num_updates = block.clone_variable(
+        num_updates = block._clone_variable(
             self._get_accumulator('num_updates', param))
         # backup param value to grad
         layers.assign(input=param, output=grad)
@@ -1194,8 +1203,8 @@ class ModelAverage(Optimizer):
         layers.elementwise_div(x=sum, y=tmp, out=param)
 
     def _add_average_restore_op(self, block, param_grad):
-        param = block.clone_variable(param_grad[0])
-        grad = block.clone_variable(param_grad[1])
+        param = block._clone_variable(param_grad[0])
+        grad = block._clone_variable(param_grad[1])
         layers.assign(input=grad, output=param)
 
     def _append_average_accumulate_op(self, param):
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6baf648198585022f992709c519038688af293e1..3bbd11d9836a62cdf9f2a84fc75e933001e12159 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -121,7 +121,7 @@ class ParallelExecutor(object):
             else:
                 cpu_num = int(
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num
+                exec_strategy.num_threads = cpu_num * 2
 
         if build_strategy is None:
             build_strategy = BuildStrategy()
@@ -152,7 +152,7 @@ class ParallelExecutor(object):
         self.executor = core.ParallelExecutor(
             self._places,
             set([
-                p.name for p in main.global_block().iter_parameters()
+                p.name for p in main.global_block()._iter_parameters()
                 if not p.stop_gradient
             ]),
             set(self.persistable_vars), main.desc, loss_name
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 0a42b9fca8dba7a11b414990be6c04c93158864f..4a61f85ec4b5c5108ded31632af75dbbdaaaba71 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -67,7 +67,7 @@ class ParamAttr(object):
         self.gradient_clip = gradient_clip
         self.model_average = do_model_average
 
-    def set_default_initializer(self, initializer):
+    def _set_default_initializer(self, initializer):
         """
         Set the default initializer, the initializer should be Constant,
         Uniform, Normal, Xavier, MSRA.
@@ -88,7 +88,7 @@ class ParamAttr(object):
 
         self.initializer = initializer
 
-    def set_default_param_initializer(self):
+    def _set_default_param_initializer(self):
         """
         Set the default initializer for the parameter with Xavier.
 
@@ -98,9 +98,9 @@ class ParamAttr(object):
         Returns:
             None.
         """
-        self.set_default_initializer(Xavier())
+        self._set_default_initializer(Xavier())
 
-    def set_default_bias_initializer(self):
+    def _set_default_bias_initializer(self):
         """
         Set the default initializer for the bias with Constant(0.0).
 
@@ -110,10 +110,10 @@ class ParamAttr(object):
         Returns:
             None.
         """
-        self.set_default_initializer(Constant(0.0))
+        self._set_default_initializer(Constant(0.0))
 
     @staticmethod
-    def to_attr(arg):
+    def _to_attr(arg):
         """
         Create ParamAttr[s].
 
@@ -131,7 +131,7 @@ class ParamAttr(object):
         if arg is None:
             return ParamAttr()
         elif isinstance(arg, list) or isinstance(arg, tuple):
-            return [ParamAttr.to_attr(a) for a in arg]
+            return [ParamAttr._to_attr(a) for a in arg]
         elif isinstance(arg, ParamAttr):
             return arg
         elif isinstance(arg, str) or isinstance(arg, unicode):
@@ -141,11 +141,11 @@ class ParamAttr(object):
         elif isinstance(arg, WeightDecayRegularizer):
             return ParamAttr(regularizer=arg)
         elif isinstance(arg, bool):
-            return ParamAttr.to_attr(None) if arg else False
+            return ParamAttr._to_attr(None) if arg else False
         else:
             raise TypeError("{0} cast to ParamAttr".format(type(arg)))
 
-    def to_kwargs(self, with_initializer=False):
+    def _to_kwargs(self, with_initializer=False):
         """
         Returns the attributes of this parameter.
 
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index dac474d5ee76590a75311d6bf2c4cb2fe85b6c40..3712955b3b32de457a0d47120a00ab7d4ecd5a66 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -15,10 +15,7 @@
 import framework
 from . import core
 
-__all__ = [
-    'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer',
-    'L2DecayRegularizer'
-]
+__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
 
 
 def append_regularization_ops(parameters_and_grads, regularization=None):
@@ -44,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
-        with param.block.program.optimized_guard(param):
-            # If no gradient then we don't need to do anything
-            if grad is None:
-                params_and_grads.append((param, grad))
-                continue
-
+        # If no gradient then we don't need to do anything
+        if grad is None:
+            params_and_grads.append((param, grad))
+            continue
+        with param.block.program.optimized_guard([param, grad]):
             regularization_term = None
             if param.regularizer is not None:
                 # Add variable for regularization term in grad block
@@ -146,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer):
             dtype="float32", shape=param.shape, lod_level=param.lod_level)
 
         if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            idx = block.create_var(
+                dtype="int64",
+                shape=param.shape,
+                type=core.VarDesc.VarType.LOD_TENSOR)
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
                 type=core.VarDesc.VarType.SELECTED_ROWS)
+            block.append_op(
+                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
                 type='lookup_table',
                 inputs={'W': param,
-                        'Ids': grad},
+                        'Ids': idx},
                 outputs={'Out': decay},
                 attrs={'is_sparse': True})
             param = decay
@@ -220,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             dtype="float32", shape=param.shape, lod_level=param.lod_level)
 
         if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            idx = block.create_var(
+                dtype="int64",
+                shape=param.shape,
+                type=core.VarDesc.VarType.LOD_TENSOR)
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
                 type=core.VarDesc.VarType.SELECTED_ROWS)
+            block.append_op(
+                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
                 type='lookup_table',
                 inputs={'W': param,
-                        'Ids': grad},
+                        'Ids': idx},
                 outputs={'Out': decay},
                 attrs={'is_sparse': True})
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 9a09db25dc0e2c71772aa06e6d0cf993321612e4..fd278f45f1c1b71a1653c3b28ace8bca8e4b1545 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -14,6 +14,7 @@
 from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
 import sys
 import numpy
@@ -134,4 +135,4 @@ def main(use_cuda):
 
 if __name__ == '__main__':
     # for use_cuda in (False, True):
-    main(use_cuda=True)
+    main(use_cuda=core.is_compiled_with_cuda())
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 1df7b99aad6094a8b8ddfe783b9de35cef61c524..95002aa7f9bb639828b47eb1e86e4ef954fb85ff 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-
+from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -144,7 +144,7 @@ def train(word_dict,
         cost, acc_out, prediction = net_method(
             data, label, input_dim=dict_dim, class_dim=class_dim)
     else:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             cost, acc, _ = net_method(
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 5f5c8544bbdb87421f129b201a0ebaf4cb8602a1..c471863920999a28cbede93a7965f07ee784f96d 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -12,15 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-import argparse
-import paddle.fluid as fluid
-import paddle
-import sys
-import numpy
-import unittest
+
+import paddle.fluid.core as core
 import math
-import sys
 import os
+import sys
+import unittest
+
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 
 BATCH_SIZE = 64
 
@@ -76,7 +79,7 @@ def train(nn_type,
         net_conf = conv_net
 
     if parallel:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             img_ = pd.read_input(img)
@@ -255,6 +258,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine):
 
 def inject_all_tests():
     for use_cuda in (False, True):
+        if use_cuda and not core.is_compiled_with_cuda():
+            continue
         for parallel in (False, True):
             for nn_type in ('mlp', 'conv'):
                 inject_test_method(use_cuda, parallel, nn_type, True)
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 49bd72c7a53c0ae740bdbabe15b1d37340699d41..3b957508ca1f11fea3bbc182dca7eaa938594cb6 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -14,6 +14,7 @@
 
 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import unittest
 import os
 import numpy as np
@@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         avg_cost, predict_word = __network__(
             [first_word, second_word, third_word, forth_word, next_word])
     else:
-        places = fluid.layers.get_places()
+        places = get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             avg_cost, predict_word = __network__(
@@ -244,7 +245,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
                     is_sparse=is_sparse,
                     is_parallel=is_parallel)
 
-    if use_cuda and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
         fn = __impl__
     else:
         # skip the other test when on CI server
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index be347cd5315668dde0454d7959dbf9bcfa465b5f..bec9f8594ff7c1aff8ae5ed55c9623754d9ea091 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import paddle
-import paddle.fluid as fluid
 import math
 import sys
 
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
+
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
 # version.
@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda():
     use_nccl = False
     place = fluid.CUDAPlace(0)
 
-places = fluid.layers.get_places(device_count=0, device_type=device_type)
+places = get_places(device_count=0, device_type=device_type)
 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
 with pd.do():
     x_ = pd.read_input(x)
diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore
similarity index 100%
rename from python/paddle/fluid/tests/demo/text_classification/.gitignore
rename to python/paddle/fluid/tests/demo/file_reader/.gitignore
diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
similarity index 88%
rename from python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
rename to python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
index 9425d472a48056e71da5da364f659971ef6c2520..b839e14889884bca8d27586aa8c1d76fba3458c1 100644
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -31,8 +31,11 @@ def load_vocab(filename):
 
 
 # load word dict with paddle inner function
-word_dict = load_vocab(sys.argv[1])
-word_dict["<unk>"] = len(word_dict)
+if len(sys.argv) == 1:
+    word_dict = paddle.dataset.imdb.word_dict()
+else:
+    word_dict = load_vocab(sys.argv[1])
+    word_dict["<unk>"] = len(word_dict)
 print "Dict dim = ", len(word_dict)
 
 # input text data
@@ -47,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
 BATCH_SIZE = 128
 train_reader = paddle.batch(
     paddle.reader.shuffle(
-        paddle.dataset.imdb.train(word_dict), buf_size=10000),
+        paddle.dataset.imdb.train(word_dict), buf_size=25000),
     batch_size=BATCH_SIZE)
 
 test_reader = paddle.batch(
diff --git a/python/paddle/fluid/tests/demo/file_reader/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc3a6dc81d24afec66ed1489aead1cff79a59bca
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/file_reader/train.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy
+import sys
+
+TRAIN_FILES = ['train.recordio']
+TEST_FILES = ['test.recordio']
+
+DICT_DIM = 5147
+
+# embedding dim
+emb_dim = 128
+
+# hidden dim
+hid_dim = 128
+
+# class num
+class_dim = 2
+
+# epoch num
+epoch_num = 10
+
+
+def build_program(is_train):
+    file_obj_handle = fluid.layers.io.open_files(
+        filenames=TRAIN_FILES if is_train else TEST_FILES,
+        shapes=[[-1, 1], [-1, 1]],
+        lod_levels=[1, 0],
+        dtypes=['int64', 'int64'])
+
+    file_obj = fluid.layers.io.double_buffer(file_obj_handle)
+
+    with fluid.unique_name.guard():
+
+        data, label = fluid.layers.read_file(file_obj)
+
+        emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
+
+        conv_3 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=3,
+            act="tanh",
+            pool_type="sqrt")
+
+        conv_4 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=4,
+            act="tanh",
+            pool_type="sqrt")
+
+        prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                     size=class_dim,
+                                     act="softmax")
+
+        # cross entropy loss
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+        # mean loss
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+
+        if is_train:
+            # SGD optimizer
+            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+    return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle}
+
+
+def main():
+    train = fluid.Program()
+    startup = fluid.Program()
+    test = fluid.Program()
+
+    with fluid.program_guard(train, startup):
+        train_args = build_program(is_train=True)
+
+    with fluid.program_guard(test, startup):
+        test_args = build_program(is_train=False)
+
+    use_cuda = fluid.core.is_compiled_with_cuda()
+    # startup
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place=place)
+    exe.run(startup)
+
+    train_exe = fluid.ParallelExecutor(
+        use_cuda=use_cuda,
+        loss_name=train_args['loss'].name,
+        main_program=train)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=use_cuda, main_program=test, share_vars_from=train_exe)
+
+    fetch_var_list = [var.name for var in train_args['log']]
+    for epoch_id in range(epoch_num):
+        # train
+        try:
+            batch_id = 0
+            while True:
+                loss, acc = map(numpy.array,
+                                train_exe.run(fetch_list=fetch_var_list))
+                print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc
+                batch_id += 1
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_args['file'].reset()
+
+        # test
+        loss = []
+        acc = []
+        try:
+            while True:
+                loss_np, acc_np = map(numpy.array,
+                                      test_exe.run(fetch_list=fetch_var_list))
+                loss.append(loss_np[0])
+                acc.append(acc_np[0])
+        except:
+            test_args['file'].reset()
+            print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py
new file mode 100644
index 0000000000000000000000000000000000000000..82065401935036ca346fa395c033f0f57100f01b
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+
+import paddle
+import paddle.dataset.mnist as mnist
+import paddle.fluid as fluid
+import paddle.v2
+
+
+def network(is_train):
+    reader = fluid.layers.py_reader(
+        capacity=10,
+        shapes=((-1, 784), (-1, 1)),
+        dtypes=('float32', 'int64'),
+        name="train_reader" if is_train else "test_reader",
+        use_double_buffer=True)
+    img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+
+    for i in xrange(2):
+        hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
+        hidden = fluid.layers.dropout(
+            hidden, dropout_prob=0.5, is_test=not is_train)
+
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(loss), reader
+
+
+def main():
+    train_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    with fluid.program_guard(train_prog, startup_prog):
+        with fluid.unique_name.guard():
+            loss, train_reader = network(True)
+            adam = fluid.optimizer.Adam(learning_rate=0.01)
+            adam.minimize(loss)
+
+    test_prog = fluid.Program()
+    test_startup = fluid.Program()
+    with fluid.program_guard(test_prog, test_startup):
+        with fluid.unique_name.guard():
+            test_loss, test_reader = network(False)
+
+    use_cuda = fluid.core.is_compiled_with_cuda()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    fluid.Executor(place).run(startup_prog)
+    fluid.Executor(place).run(test_startup)
+
+    trainer = fluid.ParallelExecutor(
+        use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog)
+
+    tester = fluid.ParallelExecutor(
+        use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
+
+    train_reader.decorate_paddle_reader(
+        paddle.v2.reader.shuffle(
+            paddle.batch(mnist.train(), 512), buf_size=8192))
+
+    test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+
+    for epoch_id in xrange(10):
+        train_reader.start()
+        try:
+            while True:
+                print 'train_loss', numpy.array(
+                    trainer.run(fetch_list=[loss.name]))
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_reader.reset()
+
+        test_reader.start()
+        try:
+            while True:
+                print 'test loss', numpy.array(
+                    tester.run(fetch_list=[test_loss.name]))
+        except fluid.core.EOFException:
+            print 'End of testing'
+            test_reader.reset()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py
deleted file mode 100644
index e408684c6e0941a1b317ffeac66f071c1382836d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import numpy
-import sys
-
-TRAIN_FILES = ['train.recordio']
-TEST_FILES = ['test.recordio']
-
-DICT_DIM = 89528
-
-# embedding dim
-emb_dim = 128
-
-# hidden dim
-hid_dim = 128
-
-# hidden dim2
-hid_dim2 = 96
-
-# class num
-class_dim = 2
-
-
-def network_cfg(is_train, pass_num=100):
-    with fluid.unique_name.guard():
-        train_file_obj = fluid.layers.open_files(
-            filenames=TRAIN_FILES,
-            pass_num=pass_num,
-            shapes=[[-1, 1], [-1, 1]],
-            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
-
-        test_file_obj = fluid.layers.open_files(
-            filenames=TEST_FILES,
-            pass_num=1,
-            shapes=[[-1, 1], [-1, 1]],
-            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
-
-        if is_train:
-            file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
-        else:
-            file_obj = test_file_obj
-
-        file_obj = fluid.layers.double_buffer(
-            file_obj,
-            name="train_double_buffer" if is_train else 'test_double_buffer')
-
-        data, label = fluid.layers.read_file(file_obj)
-
-        emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
-
-        # sequence conv with window size = 3
-        win_size = 3
-        conv_3 = fluid.nets.sequence_conv_pool(
-            input=emb,
-            num_filters=hid_dim,
-            filter_size=win_size,
-            act="tanh",
-            pool_type="max")
-
-        # fc layer after conv
-        fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
-
-        # probability of each class
-        prediction = fluid.layers.fc(input=[fc_1],
-                                     size=class_dim,
-                                     act="softmax")
-        # cross entropy loss
-        cost = fluid.layers.cross_entropy(input=prediction, label=label)
-
-        # mean loss
-        avg_cost = fluid.layers.mean(x=cost)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-
-        if is_train:
-            # SGD optimizer
-            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
-            sgd_optimizer.minimize(avg_cost)
-
-        return {
-            'loss': avg_cost,
-            'log': [avg_cost, acc],
-            'file': train_file_obj if is_train else test_file_obj
-        }
-
-
-def main():
-    train = fluid.Program()
-    startup = fluid.Program()
-
-    with fluid.program_guard(train, startup):
-        train_args = network_cfg(is_train=True)
-
-    test = fluid.Program()
-
-    with fluid.program_guard(test, fluid.Program()):
-        test_args = network_cfg(is_train=False)
-
-    # startup
-    place = fluid.CUDAPlace(0)
-    exe = fluid.Executor(place=place)
-    exe.run(startup)
-
-    train_exe = fluid.ParallelExecutor(
-        use_cuda=True, loss_name=train_args['loss'].name, main_program=train)
-
-    fetch_var_list = [var.name for var in train_args['log']]
-    for i in xrange(sys.maxint):
-        result = map(numpy.array,
-                     train_exe.run(fetch_list=fetch_var_list
-                                   if i % 1000 == 0 else []))
-        if len(result) != 0:
-            print 'Train: ', result
-
-        if i % 1000 == 0:
-            test_exe = fluid.ParallelExecutor(
-                use_cuda=True, main_program=test, share_vars_from=train_exe)
-            loss = []
-            acc = []
-            try:
-                while True:
-                    loss_np, acc_np = map(
-                        numpy.array, test_exe.run(fetch_list=fetch_var_list))
-                    loss.append(loss_np[0])
-                    acc.append(acc_np[0])
-            except:
-                test_args['file'].reset()
-                print 'TEST: ', numpy.mean(loss), numpy.mean(acc)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a2502fa2f9733a7280e8e8d884b61b419719492
--- /dev/null
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A simple machine translation demo using beam search decoder.
+"""
+
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as layers
+from paddle.fluid.executor import Executor
+from paddle.fluid.contrib.decoder.beam_search_decoder import *
+import unittest
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 32
+decoder_size = hidden_dim
+IS_SPARSE = True
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+
+def encoder():
+    # encoder
+    src_word = layers.data(
+        name="src_word", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def decoder_state_cell(context):
+    h = InitState(init=context, need_reorder=True)
+    state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h')
+
+    @state_cell.state_updater
+    def updater(state_cell):
+        current_word = state_cell.get_input('x')
+        prev_h = state_cell.get_state('h')
+        # make sure lod of h heritted from prev_h
+        h = layers.fc(input=[prev_h, current_word],
+                      size=decoder_size,
+                      act='tanh')
+        state_cell.set_state('h', h)
+
+    return state_cell
+
+
+def decoder_train(state_cell):
+    # decoder
+    trg_language_word = layers.data(
+        name="target_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    decoder = TrainingDecoder(state_cell)
+
+    with decoder.block():
+        current_word = decoder.step_input(trg_embedding)
+        decoder.state_cell.compute_state(inputs={'x': current_word})
+        current_score = layers.fc(input=decoder.state_cell.get_state('h'),
+                                  size=target_dict_dim,
+                                  act='softmax')
+        decoder.state_cell.update_states()
+        decoder.output(current_score)
+
+    return decoder()
+
+
+def decoder_decode(state_cell):
+    init_ids = layers.data(
+        name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = layers.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    decoder = BeamSearchDecoder(
+        state_cell=state_cell,
+        init_ids=init_ids,
+        init_scores=init_scores,
+        target_dict_dim=target_dict_dim,
+        word_dim=word_dim,
+        input_var_dict={},
+        topk_size=topk_size,
+        sparse_emb=IS_SPARSE,
+        max_len=max_length,
+        beam_size=beam_size,
+        end_id=1,
+        name=None)
+    decoder.decode()
+    translation_ids, translation_scores = decoder()
+
+    return translation_ids, translation_scores
+
+
+def train_main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    rnn_out = decoder_train(state_cell)
+    label = layers.data(
+        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
+    optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    feed_order = ['src_word', 'target_word', 'target_next_word']
+
+    exe = Executor(place)
+
+    def train_loop(main_program):
+        exe.run(framework.default_startup_program())
+
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
+        for pass_id in xrange(1):
+            for batch_id, data in enumerate(train_reader()):
+                outs = exe.run(main_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost])
+                avg_cost_val = np.array(outs[0])
+                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                      " avg_cost=" + str(avg_cost_val))
+                if batch_id > 3:
+                    break
+
+    train_loop(framework.default_main_program())
+
+
+def decode_main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    translation_ids, translation_scores = decoder_decode(state_cell)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [1] * batch_size
+    init_lod = [init_lod, init_lod]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = ['src_word']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    data = train_reader().next()
+    feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+    feed_dict['init_ids'] = init_ids
+    feed_dict['init_scores'] = init_scores
+
+    result_ids, result_scores = exe.run(
+        framework.default_main_program(),
+        feed=feed_dict,
+        fetch_list=[translation_ids, translation_scores],
+        return_numpy=False)
+    print result_ids.lod()
+
+
+class TestBeamSearchDecoder(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda):
+    f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu')
+
+    def f(*args):
+        with scope_prog_guard():
+            train_main(use_cuda)
+
+    setattr(TestBeamSearchDecoder, f_name, f)
+
+
+def inject_test_decode(use_cuda, decorator=None):
+    f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestBeamSearchDecoder, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    inject_test_train(_use_cuda_)
+
+for _use_cuda_ in (False, True):
+    _decorator_ = None
+    inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index 89f4c64975802dc1827ec17ed3626b91e36d6971..3dc858971c584cca947cd958680dbdcf25df9e99 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog):
     avg_cost = fluid.layers.mean(cost)
 
 prog_clip = prog.clone()
-prog_clip.block(0).var(hidden1.name).set_error_clip(
+prog_clip.block(0).var(hidden1.name)._set_error_clip(
     fluid.clip.ErrorClipByValue(
         max=CLIP_MAX, min=CLIP_MIN))
 
diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
similarity index 63%
rename from python/paddle/fluid/tests/test_mnist_if_else_op.py
rename to python/paddle/fluid/tests/test_if_else_op.py
index d34f52db5ffc889f17513d034ad2c99f696b0cdf..799c31dfe5161ff6aef47601f1b6f6e38885760b 100644
--- a/python/paddle/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -14,10 +14,15 @@
 
 import paddle
 import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program
+from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import MomentumOptimizer
 import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid.layers.control_flow import split_lod_tensor
+from paddle.fluid.layers.control_flow import merge_lod_tensor
+from paddle.fluid.layers.control_flow import ConditionalBlock
+
 import unittest
 import numpy as np
 
@@ -31,14 +36,12 @@ class TestMNISTIfElseOp(unittest.TestCase):
 
             label = layers.data(name='y', shape=[1], dtype='int64')
 
-            limit = layers.fill_constant_batch_size_like(
-                input=label, dtype='int64', shape=[1], value=5.0)
+            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
             cond = layers.less_than(x=label, y=limit)
-            true_image, false_image = layers.split_lod_tensor(
-                input=image, mask=cond)
+            true_image, false_image = split_lod_tensor(input=image, mask=cond)
 
             true_out = layers.create_tensor(dtype='float32')
-            true_cond = layers.ConditionalBlock([true_image])
+            true_cond = ConditionalBlock([cond])
 
             with true_cond.block():
                 hidden = layers.fc(input=true_image, size=100, act='tanh')
@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase):
                 layers.assign(input=prob, output=true_out)
 
             false_out = layers.create_tensor(dtype='float32')
-            false_cond = layers.ConditionalBlock([false_image])
+            false_cond = ConditionalBlock([cond])
 
             with false_cond.block():
                 hidden = layers.fc(input=false_image, size=200, act='tanh')
                 prob = layers.fc(input=hidden, size=10, act='softmax')
                 layers.assign(input=prob, output=false_out)
 
-            prob = layers.merge_lod_tensor(
+            prob = merge_lod_tensor(
                 in_true=true_out, in_false=false_out, mask=cond, x=image)
             loss = layers.cross_entropy(input=prob, label=label)
             avg_loss = layers.mean(loss)
@@ -64,7 +67,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=200)
+            batch_size=10)
 
         place = core.CPUPlace()
         exe = Executor(place)
@@ -94,8 +97,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
 
             label = layers.data(name='y', shape=[1], dtype='int64')
 
-            limit = layers.fill_constant_batch_size_like(
-                input=label, dtype='int64', shape=[1], value=5.0)
+            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
             cond = layers.less_than(x=label, y=limit)
             ie = layers.IfElse(cond)
 
@@ -125,7 +127,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
         place = core.CPUPlace()
         exe = Executor(place)
 
-        exe.run(kwargs['startup_program'])
+        exe.run(startup_prog)
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for data in train_reader():
@@ -133,7 +135,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                 y_data = y_data.reshape((y_data.shape[0], 1))
 
-                outs = exe.run(kwargs['main_program'],
+                outs = exe.run(prog,
                                feed={'x': x_data,
                                      'y': y_data},
                                fetch_list=[avg_loss])
@@ -143,6 +145,67 @@ class TestMNISTIfElseOp(unittest.TestCase):
         self.assertFalse(True)
 
 
+class TestIfElse(unittest.TestCase):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = 0.5
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+    def compare_ifelse_op_and_numpy(self, place):
+        self.set_test_case()
+
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            src = layers.data(name='data', shape=[1], dtype='float32')
+            cond = layers.fill_constant(
+                [1], dtype='float32', value=self.cond_value)
+            ifcond = layers.less_than(x=src, y=cond)
+            ie = layers.IfElse(ifcond)
+            with ie.true_block():
+                true_target = ie.input(src)
+                ie.output(true_target)
+
+            with ie.false_block():
+                false_target = ie.input(src)
+                ie.output(false_target)
+            if_out = ie()
+            out = layers.reduce_sum(if_out)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            fetch_list = [out]
+            o1, = exe.run(fluid.default_main_program(),
+                          feed={'data': self.data},
+                          fetch_list=[out])
+            o2 = np.sum(self.data)
+            self.assertTrue(
+                np.allclose(
+                    o1, o2, atol=1e-8),
+                "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2))
+
+    def test_cpu(self):
+        self.compare_ifelse_op_and_numpy(fluid.CPUPlace())
+
+    def test_cuda(self):
+        if not core.is_compiled_with_cuda():
+            return
+        self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0))
+
+
+class TestIfElseTrueBranch(TestIfElse):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = 10.
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+
+class TestIfElseFalseBranch(TestIfElse):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = -10.
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+
 if __name__ == '__main__':
-    # temp disable if else unittest since it could be buggy.
-    exit(0)
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f6c8dcabcbc592024188f4742e6c532a704d2289..a6a911721dfa31e5fb8d57645071af42adc968be 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN)
 
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
+    list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
+    list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
+    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
 endif(NOT WITH_DISTRIBUTE)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -35,7 +40,7 @@ function(py_test_modules TARGET_NAME)
              ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (py_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
   endif()
 endfunction()
@@ -43,13 +48,23 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_transformer)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
+list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
-py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+if(WITH_DISTRIBUTE)
+    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
+py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
+py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f35d96f67ad5ef79ec9cb20f070a8352f0e97e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -0,0 +1,355 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import sys
+import signal
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    def __init__(self, layers=50):
+        self.params = train_parameters
+        self.layers = layers
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
+                pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
+        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+
+        short = self.shortcut(input, num_filters * 2, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) / 2,
+            groups=groups,
+            act=None,
+            # avoid pserver CPU init differs from GPU
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant()),
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=conv, act=act)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        squeeze = fluid.layers.fc(input=pool,
+                                  size=num_channels / reduction_ratio,
+                                  act='relu')
+        stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid')
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+
+
+def get_model(batch_size):
+    # Input data
+    image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32')
+    label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
+
+    # Train program
+    model = SE_ResNeXt(layers=50)
+    out = model.net(input=image, class_dim=102)
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+
+    avg_cost = fluid.layers.mean(x=cost)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    # Evaluator
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    # Optimization
+    total_images = 6149  # flowers
+    epochs = [30, 60, 90]
+    step = int(total_images / batch_size + 1)
+
+    bd = [step * e for e in epochs]
+    base_lr = 0.1
+    lr = []
+    lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+
+    optimizer = fluid.optimizer.Momentum(
+        # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
+        #learning_rate=fluid.layers.piecewise_decay(
+        #    boundaries=bd, values=lr),
+        learning_rate=base_lr,
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    optimizer.minimize(avg_cost)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
+
+    return test_program, avg_cost, train_reader, test_reader, acc_top1, out
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistSeResneXt2x2:
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model(batch_size=2)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
+            batch_size=2)
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = test_reader()
+
+        data = next(reader_generator)
+        first_loss, = exe.run(fetch_list=[avg_cost.name],
+                              feed=feeder.feed(data))
+        print(first_loss)
+
+        for i in xrange(5):
+            data = next(reader_generator)
+            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
+
+        data = next(reader_generator)
+        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+    model = DistSeResneXt2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -0,0 +1,280 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+import os
+import sys
+import transformer_model
+import paddle.dataset.wmt16 as wmt16
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+def get_model():
+    avg_cost = transformer(use_feed=False)
+    optimizer = fluid.optimizer.Adam()
+    optimizer.minimize(avg_cost)
+    return avg_cost
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistTransformer2x2(object):
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        avg_cost = get_model()
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
+
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in xrange(5):
+            _ = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+
+    reader = paddle.batch(
+        wmt16.train(ModelHyperParams.src_vocab_size,
+                    ModelHyperParams.trg_vocab_size),
+        batch_size=transformer_model.batch_size)
+
+    with fluid.recordio_writer.create_recordio_writer(
+            WMT16_RECORDIO_FILE) as writer:
+        for batch in reader():
+            for tensor in prepare_batch_input(
+                    batch, ModelHyperParams.src_pad_idx,
+                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                t = fluid.LoDTensor()
+                t.set(tensor, fluid.CPUPlace())
+                writer.append_tensor(t)
+            writer.complete_append_tensor()
+
+    model = DistTransformer2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index e056ef9952a519d6c4d580b27f1118a3a91f13af..2ddfd47fe0c33b0e9771fe6f502b90eb77161100 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -60,12 +60,16 @@ def get_numeric_gradient(place,
         return np.array(sum).mean()
 
     tensor_to_check = scope.find_var(input_to_check).get_tensor()
-    tensor_size = product(tensor_to_check.get_dims())
-    tensor_to_check_dtype = tensor_to_check.dtype()
+    tensor_size = product(tensor_to_check.shape())
+    tensor_to_check_dtype = tensor_to_check._dtype()
     if tensor_to_check_dtype == core.VarDesc.VarType.FP32:
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
         tensor_to_check_dtype = np.float64
+    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
+        tensor_to_check_dtype = np.float16
+        # set delta as np.float16, will automatic convert to float32, float64
+        delta = np.array(delta).astype(np.float16)
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -73,16 +77,27 @@ def get_numeric_gradient(place,
     gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
 
     def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float32:
-            return tensor.get_float_element(i)
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            numpy_tensor = numpy_tensor.flatten()
+            return numpy_tensor[i]
+        elif tensor_to_check_dtype == np.float32:
+            return tensor._get_float_element(i)
         else:
-            return tensor.get_double_element(i)
+            return tensor._get_double_element(i)
 
     def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float32:
-            tensor.set_float_element(i, e)
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = e
+            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
+            tensor.set(numpy_tensor, place)
+        elif tensor_to_check_dtype == np.float32:
+            tensor._set_float_element(i, e)
         else:
-            tensor.set_double_element(i, e)
+            tensor._set_double_element(i, e)
 
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
@@ -107,7 +122,7 @@ def get_numeric_gradient(place,
         __set_elem__(tensor_to_check, i, origin)
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
-    return gradient_flat.reshape(tensor_to_check.get_dims())
+    return gradient_flat.reshape(tensor_to_check.shape())
 
 
 class OpTest(unittest.TestCase):
@@ -125,7 +140,7 @@ class OpTest(unittest.TestCase):
 
     @classmethod
     def tearDownClass(cls):
-        '''Restore random seeds'''
+        """Restore random seeds"""
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
 
@@ -133,6 +148,11 @@ class OpTest(unittest.TestCase):
         if not self.call_once:
             self.call_once = True
             self.dtype = data_type
+            # See the comment of np_dtype_to_fluid_dtype
+            # If the input type is uint16, we assume use float16
+            # for lodtensor dtype.
+            if self.dtype == np.uint16:
+                self.dtype == np.float16
 
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def infer_dtype(numpy_dict):
@@ -161,19 +181,25 @@ class OpTest(unittest.TestCase):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
-                        tensor.set(np_value[0], place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value[0]), place)
                         tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
-                        tensor.set(np_value, place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value), place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
-                    tensor.set(self.inputs[var_name][0], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name][
+                            0]), place)
                     tensor.set_recursive_sequence_lengths(self.inputs[var_name][
                         1])
                 else:
-                    tensor.set(self.inputs[var_name], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name]),
+                        place)
                 feed_map[var_name] = tensor
 
         return feed_map
@@ -251,7 +277,7 @@ class OpTest(unittest.TestCase):
             for out_name, out_dup in Operator.get_op_outputs(self.op_type):
                 fetch_list.append(str(out_name))
         # fetch_list = map(block.var, fetch_list)
-        if not isinstance(fetch_list[0], Variable):
+        if not isinstance(fetch_list[0], fluid.framework.Variable):
             fetch_list = map(block.var, fetch_list)
         outs = executor.run(program,
                             feed=feed_map,
@@ -307,13 +333,22 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + "\n" + str(expect_t))
+                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
+                    str(actual_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.recursive_sequence_lengths(),
                                          expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
     def _get_places(self):
+        if self.dtype == np.float16:
+            if core.is_compiled_with_cuda() and core.op_support_gpu(
+                    self.op_type):
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    return [place]
+            else:
+                return []
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
@@ -344,9 +379,9 @@ class OpTest(unittest.TestCase):
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
                 return ("%s Variable %s max gradient diff %f over limit %f, "
-                        "the first error element is %d, %f, %f") % (
-                            msg_prefix, name, max_diff, max_relative_error,
-                            offset, a.flatten()[offset], b.flatten()[offset])
+                        "the first error element is %d, expected %f, but got %f"
+                        ) % (msg_prefix, name, max_diff, max_relative_error,
+                             offset, a.flatten()[offset], b.flatten()[offset])
 
             self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
@@ -435,6 +470,21 @@ class OpTest(unittest.TestCase):
             input.dtype = np.uint16
         return input
 
+    @staticmethod
+    def fluid_dtype_to_np_dtype(self, dtype):
+        """
+        See above, convert the dtype to normal type.
+        """
+        if dtype == np.uint16:
+            dtype = np.float16
+        return dtype
+
+    @staticmethod
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def _get_gradient(self,
                       input_to_check,
                       place,
@@ -457,7 +507,7 @@ class OpTest(unittest.TestCase):
             if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
             executor = fluid.ParallelExecutor(
-                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
         else:
             executor = Executor(place)
         return map(np.array,
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index cddf00765f4894126988c794763c34629449e8e6..fcf86cc5839113b75855ce97459b2ee4881238cd 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -35,7 +35,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   feed_dict=None,
                                   seed=None,
                                   use_parallel_executor=True,
-                                  balance_parameter_opt_between_cards=False):
+                                  use_reduce=False,
+                                  optimizer=fluid.optimizer.Adam):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -50,14 +51,19 @@ class TestParallelExecutorBase(unittest.TestCase):
         main = fluid.Program()
         startup = fluid.Program()
         startup.random_seed = 1  # Fix random seed
+        main.random_seed = 1
         with fluid.program_guard(main, startup):
             if seed is not None:
                 startup.random_seed = seed
+                main.random_seed = seed
+
             loss = method(use_feed=feed_dict is not None)
-            adam = fluid.optimizer.Adam()
-            adam.minimize(loss)
+
+            optimizer().minimize(loss)
+
             if memory_opt:
                 fluid.memory_optimize(main)
+
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             startup_exe = fluid.Executor(place)
             startup_exe.run(startup)
@@ -65,7 +71,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             exec_strategy.allow_op_delay = allow_op_delay
 
             build_strategy = fluid.BuildStrategy()
-            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
+                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
 
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2..34f9cf0620fd1351111e93e16ed5f7e765d7078b 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -313,9 +313,9 @@ class TestAbs(OpTest):
         self.init_dtype()
 
         x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        # Because we set delta = 0.005 in caculating numeric gradient,
+        # Because we set delta = 0.005 in calculating numeric gradient,
         # if x is too small, such as 0.002, x_neg will be -0.003
-        # x_pos will be 0.007, so the numeric gradient is unaccurate.
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
         out = np.abs(x)
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
index 948836039be48ad74d5556100f06231bb89f26d3..6580c70ca68c4ba24919f03d071f6f88fb68953c 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -15,63 +15,42 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from paddle.fluid import metrics
 
 
 class TestAucOp(OpTest):
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
-        indices = np.random.randint(0, 2, (128, 2))
         labels = np.random.randint(0, 2, (128, 1))
         num_thresholds = 200
-        self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels}
+        tp = np.zeros((num_thresholds, )).astype("int64")
+        tn = np.zeros((num_thresholds, )).astype("int64")
+        fp = np.zeros((num_thresholds, )).astype("int64")
+        fn = np.zeros((num_thresholds, )).astype("int64")
+
+        self.inputs = {
+            'Predict': pred,
+            'Label': labels,
+            'TP': tp,
+            'TN': tn,
+            'FP': fp,
+            'FN': fn
+        }
         self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
-        # NOTE: sklearn use a different way to generate thresholds
-        #       which will cause the result differs slightly:
-        # from sklearn.metrics import roc_curve, auc
-        # fpr, tpr, thresholds = roc_curve(labels, pred)
-        # auc_value = auc(fpr, tpr)
-        # we caculate AUC again using numpy for testing
-        kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
-        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
-        # caculate TP, FN, TN, FP count
-        tp_list = np.ndarray((num_thresholds, ))
-        fn_list = np.ndarray((num_thresholds, ))
-        tn_list = np.ndarray((num_thresholds, ))
-        fp_list = np.ndarray((num_thresholds, ))
-        for idx_thresh, thresh in enumerate(thresholds):
-            tp, fn, tn, fp = 0, 0, 0, 0
-            for i, lbl in enumerate(labels):
-                if lbl:
-                    if pred[i, 0] >= thresh:
-                        tp += 1
-                    else:
-                        fn += 1
-                else:
-                    if pred[i, 0] >= thresh:
-                        fp += 1
-                    else:
-                        tn += 1
-            tp_list[idx_thresh] = tp
-            fn_list[idx_thresh] = fn
-            tn_list[idx_thresh] = tn
-            fp_list[idx_thresh] = fp
-
-        epsilon = 1e-6
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
-
-        x = fpr[:num_thresholds - 1] - fpr[1:]
-        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
-        auc_value = np.sum(x * y)
-
-        self.outputs = {'AUC': auc_value}
+        python_auc = metrics.Auc(name="auc",
+                                 curve='ROC',
+                                 num_thresholds=num_thresholds)
+        python_auc.update(pred, labels)
+
+        self.outputs = {
+            'AUC': python_auc.eval(),
+            'TPOut': python_auc.tp_list,
+            'FNOut': python_auc.fn_list,
+            'TNOut': python_auc.tn_list,
+            'FPOut': python_auc.fp_list
+        }
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index a62ee9596d0f6c58135b4a13249b638e84e63c3c..fcb2612326e74cf6417aa93f2691154c79b5e44c 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -129,7 +129,6 @@ def create_or_get_tensor(scope, var_name, var, place):
     if var is not None:
         assert isinstance(var, np.ndarray)
         tensor.set_recursive_sequence_lengths([])
-        tensor.set_dims(var.shape)
         tensor.set(var, place)
     return tensor
 
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index db5771f7b0ad74c73b81d502209c17dce3ce8457..4a3ac2a31e072eb1a15af31f558cf9f626a7ac53 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -100,6 +100,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             np.array_equal(np.array(sentence_scores), expected_data))
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
     def setUp(self):
         self.scope = core.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 06e676cd83e77549afd679e730426c590cc046bf..7f2a9e6971ed933463216e38498d48ab132a1a37 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -16,8 +16,6 @@ import unittest
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import calc_gradient
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
deleted file mode 100644
index e22400a045ced16c46b0bf005155f621f249d263..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_checkpoint.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import unittest
-import os
-import tempfile
-
-
-class TestCheckpoint(unittest.TestCase):
-    def setUp(self):
-        self.dirname = tempfile.mktemp()
-        self.max_num_checkpoints = 3
-        self.epoch_interval = 1
-        self.step_interval = 1
-        self.trainer_id = 0
-        self.chief = self.trainer_id == 0
-        self.place = fluid.CPUPlace()
-        self.epoch_id = 100
-        self.step_id = 20
-
-    def test_checkpoint(self):
-        self.save_checkpoint()
-        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
-        self.assertTrue(serial >= 0)
-        trainer_args = ["epoch_id", "step_id"]
-        epoch_id, step_id = fluid.io.load_trainer_args(
-            self.dirname, serial, self.trainer_id, trainer_args)
-        self.assertEqual(self.step_id, int(step_id))
-        self.assertEqual(self.epoch_id, int(epoch_id))
-
-        program = fluid.Program()
-        with fluid.program_guard(program):
-            exe = fluid.Executor(self.place)
-            fluid.io.load_checkpoint(exe, self.dirname, serial, program)
-
-        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
-        self.assertFalse(os.path.isdir(self.dirname))
-
-    def save_checkpoint(self):
-        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
-                                        self.epoch_interval, self.step_interval)
-
-        trainer_args = {}
-        trainer_args["epoch_id"] = self.epoch_id
-        trainer_args["step_id"] = self.step_id
-
-        program = fluid.Program()
-        with fluid.program_guard(program):
-            program.global_block().create_var(
-                name="scale_0",
-                psersistable=True,
-                dtype="float32",
-                shape=[32, 32])
-
-            exe = fluid.Executor(self.place)
-            for i in xrange(10):
-                fluid.io.save_checkpoint(exe, config.checkpoint_dir,
-                                         self.trainer_id, trainer_args, program,
-                                         config.max_num_checkpoints)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 084b8d37386fac0366c190f5f30dd39467072498..d9f83905e6135e22f74e749857f9b0fbe464d3f4 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -18,14 +18,15 @@ import paddle.fluid.core as core
 from paddle.fluid.framework import default_startup_program, default_main_program
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
+from paddle.fluid.layers.control_flow import ConditionalBlock
 import numpy
 
 
-class ConditionalBlock(unittest.TestCase):
+class ConditionalBlockTest(unittest.TestCase):
     def test_forward(self):
         data = layers.data(name='X', shape=[1], dtype='float32')
         data.stop_gradient = False
-        cond = layers.ConditionalBlock(inputs=[data])
+        cond = ConditionalBlock(inputs=[data])
         out = layers.create_tensor(dtype='float32')
         with cond.block():
             hidden = layers.fc(input=data, size=10)
diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py
index d1075d514e9b2b692f271f10a005815a66b421fb..58ac6fa0a9a30a08a831111513777cca59062724 100644
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ b/python/paddle/fluid/tests/unittests/test_const_value.py
@@ -16,7 +16,7 @@ import unittest
 import paddle.fluid.framework as framework
 
 
-class ConditionalBlock(unittest.TestCase):
+class ConstantTest(unittest.TestCase):
     def test_const_value(self):
         self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
         self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 07545e7feb46c85a4b80f9b846be27d36cbfb59a..af6cd99b0d7e6b0a2dfd4fc1d33e8390017a5906 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -191,12 +191,16 @@ class TestWithDilation(TestConv2dTransposeOp):
 
 
 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv2dTransposeOp):
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1]
@@ -212,6 +216,8 @@ class TestCUDNNWithPad(TestWithPad):
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1]
@@ -227,6 +233,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
     def init_test_case(self):
         self.pad = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index c9f26d10df8ff39d6bd77b1597336600f676d362..300fa5e8bde001e0f66c5f924a81c30add99aead 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -197,12 +197,16 @@ class TestWithDilation(TestConv3dTransposeOp):
 
 
 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv3dTransposeOp):
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1, 1]
@@ -218,6 +222,8 @@ class TestCUDNNWithPad(TestWithPad):
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1, 1]
@@ -233,6 +239,8 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
     def init_test_case(self):
         self.pad = [1, 1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index 6d810920d55ccf069ff408c553069e8f5e590271..aa09b0ea445adccae3f741b53850f8182f3270cc 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -142,8 +142,7 @@ class TestDataBalance(unittest.TestCase):
                 filenames=[self.lod_data_file_name],
                 shapes=[[-1, 3], [-1, 1]],
                 lod_levels=[1, 0],
-                dtypes=['float32', 'int32'],
-                thread_num=1)
+                dtypes=['float32', 'int32'])
             ins, label = fluid.layers.read_file(data_reader)
 
             place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
@@ -156,7 +155,7 @@ class TestDataBalance(unittest.TestCase):
                 main_program=main_prog,
                 build_strategy=build_strategy)
 
-            if (parallel_exe.device_count > self.batch_size):
+            if parallel_exe.device_count > self.batch_size:
                 print("WARNING: Unittest TestDataBalance skipped. \
                     For the result is not correct when device count \
                     is larger than batch size.")
@@ -190,3 +189,7 @@ class TestDataBalance(unittest.TestCase):
     def test_all(self):
         self.main()
         self.main_lod()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..58cfd4e1fd958d8d59e49c87fbbabd0182975add
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import unittest
+import os
+import sys
+import signal
+import subprocess
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self, model_file):
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 50
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error as e:
+                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
+                                 (e, retry_times))
+                retry_times -= 1
+
+    def check_with_place(self, model_file, delta=1e-3):
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
+        required_envs = {
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH"),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1"
+        }
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, model_file,
+             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
+        local_proc.wait()
+        out, err = local_proc.communicate()
+        local_ret = out
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver(model_file)
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env1)
+
+        tr0_proc.wait()
+        tr1_proc.wait()
+        out, err = tr0_proc.communicate()
+        sys.stderr.write('dist_stderr: %s\n' % err)
+        loss_data0 = out
+        sys.stderr.write('dist_loss: %s\n' % loss_data0)
+        lines = loss_data0.split("\n")
+        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+        local_lines = local_ret.split("\n")
+        local_first_loss = eval(local_lines[0])[0]
+        local_last_loss = eval(local_lines[1])[0]
+
+        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
+        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
+
+        # check tr0_out
+        # FIXME: ensure the server process is killed
+        # replace with ps0.terminate()
+        os.kill(ps0.pid, signal.SIGKILL)
+        os.kill(ps1.pid, signal.SIGKILL)
+        FNULL.close()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3a5fd6985bab1d04f6e1484534367548f383dfb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -0,0 +1,24 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistSeResneXt2x2(TestDistBase):
+    def test_se_resnext(self):
+        self.check_with_place("dist_se_resnext.py")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 562e66b0625083fe840d64967249f0215cfda1f9..aab8969a96ff69d1a306506337a0e009f14758b9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -22,6 +22,9 @@ import numpy
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.io import ListenAndServ
+from paddle.fluid.layers.io import Recv
+from paddle.fluid.layers.io import Send
 
 
 class TestSendOp(unittest.TestCase):
@@ -65,8 +68,7 @@ class TestSendOp(unittest.TestCase):
         main = fluid.Program()
 
         with fluid.program_guard(main):
-            serv = layers.ListenAndServ(
-                "127.0.0.1:0", ["X"], optimizer_mode=False)
+            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
             with serv.do():
                 out_var = main.global_block().create_var(
                     name="scale_0.tmp_0",
@@ -99,8 +101,8 @@ class TestSendOp(unittest.TestCase):
                 persistable=False,
                 shape=[32, 32])
             fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
-            layers.Send("127.0.0.1:%d" % port, [x])
-            o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+            Send("127.0.0.1:%d" % port, [x])
+            o = Recv("127.0.0.1:%d" % port, [get_var])
 
         exe = fluid.Executor(place)
         self.dist_out = exe.run(main, fetch_list=o)  # o is a list
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTransformer2x2(TestDistBase):
+    def test_transformer(self):
+        # TODO(paddle-dev): check if the delta is OK.
+        # Usually start around ~8000 and converge to ~5000
+        self.check_with_place("dist_transformer.py", delta=400)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 75b4b4e50da04521021dcb1e97cfe495f2619433..abd372126848c5779cf7d989dc03e421dc94b1cf 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -27,7 +27,6 @@ class TranspilerTest(unittest.TestCase):
         self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
         self.pserver1_ep = "127.0.0.1:6174"
         self.pserver2_ep = "127.0.0.1:6175"
-        self.slice_var_up = True
         self.sync_mode = True
         self.transpiler = None
 
@@ -52,32 +51,41 @@ class TranspilerTest(unittest.TestCase):
         self.origin_prog = main.clone()
         return main
 
-    def get_trainer(self):
-        t = self._transpiler_instance()
+    def get_trainer(self, config=None, sync_mode=True):
+        t = self._transpiler_instance(config, sync_mode)
         return t.get_trainer_program()
 
-    def get_pserver(self, ep):
-        t = self._transpiler_instance()
+    def get_pserver(self, ep, config=None, sync_mode=True):
+        t = self._transpiler_instance(config, sync_mode)
         pserver = t.get_pserver_program(ep)
         startup = t.get_startup_program(ep, pserver)
         return pserver, startup
 
-    def _transpiler_instance(self):
+    def _transpiler_instance(self, config=None, sync_mode=True):
         if not self.transpiler:
             main = self.get_main_program()
-            self.transpiler = fluid.DistributeTranspiler()
+            self.transpiler = fluid.DistributeTranspiler(config=config)
             self.transpiler.transpile(
                 self.trainer_id,
                 program=main,
                 pservers=self.pserver_eps,
                 trainers=self.trainers,
-                slice_var_up=self.slice_var_up,
-                sync_mode=self.sync_mode)
+                sync_mode=sync_mode)
+
         return self.transpiler
 
+    def transpiler_test_impl(self):
+        pass
 
-class TestBasicModel(TranspilerTest):
     def test_transpiler(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            self.transpiler_test_impl()
+
+
+class TestBasicModel(TranspilerTest):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep)
 
@@ -124,14 +132,67 @@ class TestBasicModel(TranspilerTest):
         self.assertEqual(set(pserver_params), set(trainer_params))
 
 
+class TestBasicModelWithLargeBlockSize(TranspilerTest):
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.min_block_size = 1048576
+
+        pserver, startup = self.get_pserver(self.pserver1_ep, config)
+        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
+
+        trainer = self.get_trainer(config)
+
+        self.assertEqual([op.type for op in trainer.global_block().ops], [
+            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
+            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'send_barrier',
+            'recv', 'recv', 'fetch_barrier'
+        ])
+
+        self.assertEqual(len(pserver.blocks), 2)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block1~2: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+        # confirm startup program
+        self.assertEqual([op.type for op in startup.global_block().ops],
+                         ["fill_constant", "fill_constant"])
+        # the variable #fc_w will be split into two blocks
+        fc_w_var = startup2.global_block().var("fc_w")
+        self.assertEqual(fc_w_var.shape, (1000L, 1000L))
+        # all parameters should be optimized on pserver
+
+        pserver_params = []
+        for prog in [pserver, pserver2]:
+            for blk in prog.blocks:
+                for op in blk.ops:
+                    if "Param" in op.input_names:
+                        param_name = op.input("Param")[0]
+                        is_block_idx = param_name.find(".block")
+                        if is_block_idx != -1:
+                            origin_param_name = param_name[:is_block_idx]
+                        else:
+                            origin_param_name = param_name
+                        pserver_params.append(origin_param_name)
+        trainer_params = []
+        for op in self.origin_prog.global_block().ops:
+            if "Param" in op.input_names:
+                trainer_params.append(op.input("Param")[0])
+        self.assertEqual(set(pserver_params), set(trainer_params))
+
+
 class TestNoSliceVar(TranspilerTest):
     def setUp(self):
         super(TestNoSliceVar, self).setUp()
-        self.slice_var_up = False
 
-    def test_transpiler(self):
-        _, startup = self.get_pserver(self.pserver1_ep)
-        _, startup2 = self.get_pserver(self.pserver2_ep)
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.slice_var_up = False
+
+        _, startup = self.get_pserver(self.pserver1_ep, config)
+        _, startup2 = self.get_pserver(self.pserver2_ep, config)
 
         if startup.global_block().vars.has_key("fc_w"):
             fc_w_var = startup.global_block().vars["fc_w"]
@@ -161,7 +222,7 @@ class TestLRDecay(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -191,7 +252,7 @@ class TestLRDecayConditional(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -240,7 +301,7 @@ class TestL2Decay(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -253,10 +314,226 @@ class TestL2Decay(TranspilerTest):
         # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer
 
 
-    # FIXME(typhoonzero): need to add test for async case:
-    # see https://github.com/PaddlePaddle/Paddle/issues/11691
-class TestAsyncSGD(TranspilerTest):
-    pass
+class TestL2DecayWithPiecewise(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        base_lr = 1.0
+        bd = [1, 10, 20, 30]
+        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+        sgd_optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer = self.get_trainer()
+
+        self.assertEqual(len(pserver.blocks), 9)
+        self.assertEqual([op.type for op in pserver.blocks[1].ops], [
+            "increment", "cast", "fill_constant", "fill_constant", "less_than",
+            "logical_not", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "conditional_block"
+        ])
+        self.assertEqual(
+            [op.type for op in pserver.blocks[7].ops],
+            ["sum", "scale", "scale", "elementwise_add", "momentum"])
+        self.assertEqual(
+            [op.type for op in pserver.blocks[8].ops],
+            ["sum", "scale", "scale", "elementwise_add", "momentum"])
+
+
+class TestDistLookupTableBase(TranspilerTest):
+    def network_with_table(self, is_sparse, is_distributed):
+        def emb_pool(ids):
+            table_size = 1000
+            emb_size = 64
+            emb = fluid.layers.embedding(
+                input=ids,
+                size=[table_size, emb_size],
+                dtype='float32',
+                param_attr='shared_w',  # share parameter
+                is_sparse=is_sparse,
+                is_distributed=is_distributed)
+            pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
+            return pool
+
+        title_ids = fluid.layers.data(
+            name='title_ids', shape=[1], dtype='int64', lod_level=1)
+        brand_ids = fluid.layers.data(
+            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
+        title_emb = emb_pool(title_ids)
+        brand_emb = emb_pool(brand_ids)
+        fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
+        predict = fluid.layers.fc(input=fc0,
+                                  size=2,
+                                  act=None,
+                                  param_attr=fluid.ParamAttr(name='fc_w'),
+                                  bias_attr=fluid.ParamAttr(name='fc_b'))
+
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(cost)
+        optimizer = fluid.optimizer.Adam(learning_rate=0.003)
+        optimizer.minimize(avg_cost)
+
+
+class TestLocalLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 3)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "adam", "scale", "scale"])
+
+        trainer = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
+            'fill_constant', 'mean_grad', 'cross_entropy_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
+            'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestDistLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 6)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "sgd"])
+        # 3 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["lookup_sparse_table"])
+        # 4 prefetch -> lookup_sparse_table for data1
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
+                         ["lookup_sparse_table"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
+
+        trainer = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
+            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv',
+            'fetch_barrier'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestAsyncLocalLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
+
+        self.assertEqual(len(pserver1.blocks), 3)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["adam", "scale", "scale"])
+
+        trainer = self.get_trainer(config)
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
+            'fill_constant', 'mean_grad', 'cross_entropy_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
+            'recv', 'recv', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestAsyncDistLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
+
+        self.assertEqual(len(pserver1.blocks), 6)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
+        # 3 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["lookup_sparse_table"])
+        # 4 prefetch -> lookup_sparse_table for data1
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
+                         ["lookup_sparse_table"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
+
+        trainer = self.get_trainer(config)
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
+            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_ids', 'send', 'recv', 'recv'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 0faed94deb4808783027d776e0f4c61da0db457a..4448de8839d7ad4ad1f70ecdc4ac94da1e619adb 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -17,6 +17,12 @@ import paddle
 import unittest
 import numpy
 
+from paddle.fluid.layers.control_flow import lod_rank_table
+from paddle.fluid.layers.control_flow import max_sequence_len
+from paddle.fluid.layers.control_flow import lod_tensor_to_array
+from paddle.fluid.layers.control_flow import array_to_lod_tensor
+from paddle.fluid.layers.control_flow import shrink_memory
+
 
 class TestDynRNN(unittest.TestCase):
     def setUp(self):
@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase):
 
             label = fluid.layers.data(name='label', shape=[1], dtype='float32')
 
-            rank_table = fluid.layers.lod_rank_table(x=sent_emb)
+            rank_table = lod_rank_table(x=sent_emb)
 
-            sent_emb_array = fluid.layers.lod_tensor_to_array(
-                x=sent_emb, table=rank_table)
+            sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
 
-            seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
+            seq_len = max_sequence_len(rank_table=rank_table)
             i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
             i.stop_gradient = False
 
@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase):
                 mem = fluid.layers.array_read(array=mem_array, i=i)
                 ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
 
-                mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)
+                mem = shrink_memory(x=mem, i=i, table=rank_table)
 
                 hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
 
@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase):
                 fluid.layers.array_write(x=hidden, i=i, array=mem_array)
                 fluid.layers.less_than(x=i, y=seq_len, cond=cond)
 
-            all_timesteps = fluid.layers.array_to_lod_tensor(
-                x=out, table=rank_table)
+            all_timesteps = array_to_lod_tensor(x=out, table=rank_table)
             last = fluid.layers.sequence_last_step(input=all_timesteps)
             logits = fluid.layers.fc(input=last, size=1, act=None)
             loss = fluid.layers.sigmoid_cross_entropy_with_logits(
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index 92e718662dfd7998be3ede2994f160059679fa8a..31af1245720405ee067a0acf3575e3ae86372c13 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -65,10 +65,10 @@ class TestDyRnnStaticInput(unittest.TestCase):
         return self._lodtensor_to_ndarray(fetch_outs[0])
 
     def _lodtensor_to_ndarray(self, lod_tensor):
-        dims = lod_tensor.get_dims()
+        dims = lod_tensor.shape()
         ndarray = np.zeros(shape=dims).astype('float32')
         for i in xrange(np.product(dims)):
-            ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+            ndarray.ravel()[i] = lod_tensor._get_float_element(i)
         return ndarray, lod_tensor.recursive_sequence_lengths()
 
     def build_graph(self, only_forward=False):
@@ -185,19 +185,19 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
         actual_gradients, actual_lod = self.fetch_value(static_input_grad)
 
-        static_input_shape = self.static_input_tensor.get_dims()
+        static_input_shape = self.static_input_tensor.shape()
         numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
         # calculate numeric gradients
         tensor_size = np.product(static_input_shape)
         for i in xrange(tensor_size):
-            origin = self.static_input_tensor.get_float_element(i)
+            origin = self.static_input_tensor._get_float_element(i)
             x_pos = origin + self._delta
-            self.static_input_tensor.set_float_element(i, x_pos)
+            self.static_input_tensor._set_float_element(i, x_pos)
             y_pos = self.fetch_value(loss)[0][0]
             x_neg = origin - self._delta
-            self.static_input_tensor.set_float_element(i, x_neg)
+            self.static_input_tensor._set_float_element(i, x_neg)
             y_neg = self.fetch_value(loss)[0][0]
-            self.static_input_tensor.set_float_element(i, origin)
+            self.static_input_tensor._set_float_element(i, origin)
             numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
         self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
         self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..1854232194963bcbe302010320a30d85747eea96 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a41c44fe655b18626bdb727745dae032babe8ad
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
@@ -0,0 +1,58 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class TestExtractRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Variable
+        feature_len = 12
+        rows = [0, 4, 4, 7]
+        np_array = np.ones((len(rows), feature_len)).astype("float32")
+
+        in_x = scope.var('X').get_selected_rows()
+        in_x.set_height(len(rows))
+        in_x.set_rows(rows)
+        in_x_tensor = in_x.get_tensor()
+        in_x_tensor.set(np_array, place)
+
+        # create Out Variable
+        out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        extract_rows_op = Operator("extract_rows", X='X', Out='Out')
+        extract_rows_op.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        result_array = [ele[0] for ele in result_array]
+        assert result_array == rows
+
+    def test_concat_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 281068e945e76a42635868d19573498f79fde1f3..026ac2112b2d78644b3315b9cab8019ca27e9714 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -40,7 +40,6 @@ class TestFakeDequantizeMaxAbsOp(OpTest):
         self.op_type = "fake_dequantize_max_abs"
         x = np.random.randn(31, 65).astype("float32")
         yq, scale = quantize_max_abs(x, self.num_bits)
-        print 'scale ', scale
         ydq = dequantize_max_abs(yq, self.num_bits, scale)
 
         self.inputs = {'X': yq}
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c6aa9d3bb656740c528c728efafc6a47e8bff91
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFakeQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize"
+        self.attrs = {
+            'bit_length': 8,
+            'quantize_type': 'abs_max',
+            'window_size': 10000
+        }
+        self.inputs = {
+            'X': np.random.random((10, 10)).astype("float32"),
+            'InScales': np.zeros(self.attrs['window_size']).astype("float32"),
+            'InCurrentIter': np.zeros(1).astype("float32"),
+            'InMovingScale': np.zeros(1).astype("float32")
+        }
+        self.scale = {
+            'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
+        }
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutScales': np.zeros(self.attrs['window_size']).astype("float32"),
+            'OutMovingScale':
+            np.array([self.scale['abs_max']]).astype("float32"),
+            'OutCurrentIter': np.zeros(1).astype("float32")
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8692ce2ea66ef61c63bc41e77df050398ac63fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+class TestFlattenOp(OpTest):
+    def setUp(self):
+        self.op_type = "flatten"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 5)
+        self.axis = 1
+        self.new_shape = (3, 20)
+
+    def init_attrs(self):
+        self.attrs = {"axis": self.axis}
+
+
+class TestFlattenOp(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.axis = 0
+        self.new_shape = (1, 36)
+
+
+class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.new_shape = (3, 12)
+
+    def init_attrs(self):
+        self.attrs = {}
+
+
+class TestFlattenOpSixDims(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.axis = 4
+        self.new_shape = (36, 16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec0a939e9ec21952a6657ea849bb9844bb69cc8d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -0,0 +1,818 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+# scale + add
+#   TestElementwiseAddOp
+#   TestFusedOperatorsOp_scalar
+#   TestFusedOperatorsOp_scalar2
+#   TestFusedOperatorsOp_Vector
+#   TestFusedOperatorsOp_broadcast_0
+#   TestFusedOperatorsOp_broadcast_1
+#   TestFusedOperatorsOp_broadcast_2
+#   TestFusedOperatorsOp_broadcast_3
+#   TestFusedOperatorsOp_broadcast_4
+#   TestFusedOperatorsOp_rowwise_add_0
+#   TestFusedOperatorsOp_rowwise_add_1
+#   TestFusedOperatorsOp_channelwise_add
+
+
+class TestElementwiseAddOp(OpTest):
+    def setUp(self):
+        self.op_type = "fused_elemwise_activation"
+        self.dtype = np.float32
+        self.axis = -1
+
+        self.init_axis()
+        self.init_dtype()
+        self.init_input()
+        self.init_output()
+        self.init_attr()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.outputs = {'Out': self.out}
+
+    def init_input(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["scale", "elementwise_add"]
+        }
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestFusedOperatorsOp_scalar(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_scalar2(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_Vector(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_0(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(2, 1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_1(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_2(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 1, 4)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_3(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 4, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_4(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(2, 1, 1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 4)) * self.scale
+
+
+class TestFusedOperatorsOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_channelwise_add(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+# add + scale
+#   TestElementwiseAddOp_f_add_scale
+#   TestFusedOperatorsOp_scalar_f_add_scale
+#   TestFusedOperatorsOp_scalar2_f_add_scale
+#   TestFusedOperatorsOp_Vector_f_add_scale
+#   TestFusedOperatorsOp_broadcast_0_f_add_scale
+#   TestFusedOperatorsOp_broadcast_1_f_add_scale
+#   TestFusedOperatorsOp_broadcast_2_f_add_scale
+#   TestFusedOperatorsOp_broadcast_3_f_add_scale
+#   TestFusedOperatorsOp_broadcast_4_f_add_scale
+#   TestFusedOperatorsOp_rowwise_add_0_f_add_scale
+#   TestFusedOperatorsOp_rowwise_add_1_f_add_scale
+#   TestFusedOperatorsOp_channelwise_add_f_add_scale
+
+
+class TestFusedOperatorsOp_f_add_scale(TestElementwiseAddOp):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_add_scale(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_add_scale(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_add_scale(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_add_scale(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(2, 1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_add_scale(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_add_scale(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 1, 4) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_add_scale(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 4, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_add_scale(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y.reshape(2, 1, 1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_add_scale(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 4) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_add_scale(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y.reshape(1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_add_scale(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+# add + relu
+#   TestElementwiseAddOp_f_add_relu
+#   TestFusedOperatorsOp_scalar_f_add_relu
+#   TestFusedOperatorsOp_scalar2_f_add_relu
+#   TestFusedOperatorsOp_Vector_f_add_relu
+#   TestFusedOperatorsOp_broadcast_0_f_add_relu
+#   TestFusedOperatorsOp_broadcast_1_f_add_relu
+#   TestFusedOperatorsOp_broadcast_2_f_add_relu
+#   TestFusedOperatorsOp_broadcast_3_f_add_relu
+#   TestFusedOperatorsOp_broadcast_4_f_add_relu
+#   TestFusedOperatorsOp_rowwise_add_0_f_add_relu
+#   TestFusedOperatorsOp_rowwise_add_1_f_add_relu
+#   TestFusedOperatorsOp_channelwise_add_f_add_relu
+
+
+class TestFusedOperatorsOp_f_add_relu(TestElementwiseAddOp):
+    def init_output(self):
+        # Copy from test_activation_op.py
+        # Because we set delta = 0.005 in calculating numeric gradient,
+        # if x is too small, such as 0.002, x_neg will be -0.003
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
+        # we should avoid this
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_add_relu(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_add_relu(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_add_relu(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_add_relu(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(2, 1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_add_relu(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_add_relu(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 1, 4), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_add_relu(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 4, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_add_relu(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(2, 1, 1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_add_relu(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 4), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_add_relu(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_add_relu(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+# relu + add
+#   TestElementwiseAddOp_f_relu_add
+#   TestFusedOperatorsOp_scalar_f_relu_add
+#   TestFusedOperatorsOp_scalar2_f_relu_add
+#   TestFusedOperatorsOp_Vector_f_relu_add
+#   TestFusedOperatorsOp_broadcast_0_f_relu_add
+#   TestFusedOperatorsOp_broadcast_1_f_relu_add
+#   TestFusedOperatorsOp_broadcast_2_f_relu_add
+#   TestFusedOperatorsOp_broadcast_3_f_relu_add
+#   TestFusedOperatorsOp_broadcast_4_f_relu_add
+#   TestFusedOperatorsOp_rowwise_add_0_f_relu_add
+#   TestFusedOperatorsOp_rowwise_add_1_f_relu_add
+#   TestFusedOperatorsOp_channelwise_add_f_relu_add
+
+
+class TestFusedOperatorsOp_f_relu_add(TestElementwiseAddOp):
+    def init_output(self):
+        # Copy from test_activation_op.py
+        # Because we set delta = 0.005 in calculating numeric gradient,
+        # if x is too small, such as 0.002, x_neg will be -0.003
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
+        # we should avoid this
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_relu_add(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_relu_add(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_relu_add(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_relu_add(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(2, 1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_relu_add(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_relu_add(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 1, 4)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_relu_add(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_relu_add(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_relu_add(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 4)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_relu_add(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_relu_add(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
index 6dab1e22f0c50ab011d6b8e8944097600cf3fecc..964423e2d2638224244b4ca774d8eee08f3ec989 100644
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import decorators
 import unittest
 
@@ -20,7 +21,7 @@ import unittest
 class TestGetPlaces(unittest.TestCase):
     @decorators.prog_scope()
     def test_get_places(self):
-        places = fluid.layers.get_places()
+        places = get_places()
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
         exe.run(fluid.default_main_program())
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..daa5da8d95129af0305b326832a557daeb4c5c9c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+
+np.random.seed(100)
+
+
+def find_latest_set(num):
+    return 1 + int(math.floor(math.log(num, 2)))
+
+
+class CodeTable(object):
+    def __init__(self, num_classes, code):
+        self.c = num_classes + code
+
+    def cal_index(self, bit):
+        return (self.c >> (bit + 1)) - 1
+
+    def get_length(self):
+        return find_latest_set(self.c) - 1
+
+    def cal_bit(self, bit):
+        return self.c & (1 << bit)
+
+
+def hsigmoid(x, w, label, bias, num_classes):
+    batch_size = x.shape[0]
+    code_length = find_latest_set(num_classes - 1)
+    code_table = [0 for _ in range(code_length)]
+    pre_output = np.zeros((batch_size, code_length))
+    pre_sum = np.zeros((batch_size, 1))
+    out = np.zeros((batch_size, 1)).astype("float32")
+    for i in range(batch_size):
+        code_table = CodeTable(num_classes, label[i])
+        length = code_table.get_length()
+        for j in range(length):
+            idx = code_table.cal_index(j)
+            pre_output[i][j] += bias[0][idx]
+    for i in range(batch_size):
+        code_table = CodeTable(num_classes, label[i])
+        length = code_table.get_length()
+        for j in range(length):
+            idx = code_table.cal_index(j)
+            pre_output[i][j] += np.dot(w[idx], x[i])
+    # clip[-40.0, 40.0]
+    pre_output = np.clip(pre_output, -40.0, 40.0)
+    # out(i, 0) = \sum_j  bit(i, j) * preout(i, j)
+    for i in range(batch_size):
+        code_table = CodeTable(num_classes, label[i])
+        length = code_table.get_length()
+        sum = 0.0
+        for j in range(length):
+            if code_table.cal_bit(j):
+                sum += pre_output[i][j]
+        out[i] = -1.0 * sum
+    # soft relu
+    pre_output = np.log(1 + np.exp(pre_output))
+    pre_sum = pre_output.sum(1).reshape((batch_size, 1))
+    out += pre_sum
+    return pre_output, out
+
+
+class TestHSigmoidOp(OpTest):
+    def setUp(self):
+        self.op_type = "hierarchical_sigmoid"
+        num_classes = 6
+        feature_size = 8
+        batch_size = 4
+        x = np.random.random((batch_size, feature_size)).astype("float32")
+        w = np.random.random((num_classes - 1, feature_size)).astype("float32")
+        label = np.random.randint(0, num_classes, (batch_size, 1))
+        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        self.attrs = {'num_classes': num_classes}
+        self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
+        pre_output, out = hsigmoid(x, w, label, bias, num_classes)
+        self.outputs = {'PreOut': pre_output, 'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
index 4946475f11a4fc0ccaffeec6821d3976ea7c6560..13bc5768740ece00bbe285a0b47d82bb8a42d2c7 100644
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -16,23 +16,48 @@ import numpy as np
 from op_test import OpTest
 
 
-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    batchsize = in_shape[0]
     img_height = in_shape[2]
     img_width = in_shape[3]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            if img_real_size[index, 0] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 0] / out_stride[1] + 1
+            output_height[0,index] = \
+              1 +  \
+              (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
 
-    paddings = attrs['paddings']
-    kernels = attrs['kernels']
-    strides = attrs['strides']
+            output_width[0,index] = \
+              1 + \
+              (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
+              1 +  \
+              (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
 
-    output_height = \
-      1 +  \
-      (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-          strides[0]
-
-    output_width = \
-      1 + \
-      (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-          strides[1]
+            output_width[0,index] = \
+              1 + \
+              (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
 
     return output_height, output_width
 
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                                     im_row_offset][im_col_offset]
 
 
-def Im2Sequence(inputs, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+def Im2Sequence(inputs, img_real_size, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
     img_channels = inputs.shape[1]
     batch_size = inputs.shape[0]
-    out = np.zeros([
-        batch_size, output_height, output_width, img_channels,
-        attrs['kernels'][0], attrs['kernels'][1]
-    ]).astype("float32")
-
-    for i in range(len(inputs)):
-        im2col(attrs, inputs[i], out[i])
-
-    out = out.reshape([
-        batch_size * output_height * output_width,
-        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
-    ])
+    out = []
+    for index in range(batch_size):
+        tmp = np.zeros([
+            output_height[0, index], output_width[0, index], img_channels,
+            attrs['kernels'][0], attrs['kernels'][1]
+        ]).astype("float32")
+        out.append(tmp)
+    for index in range(len(inputs)):
+        im2col(attrs, inputs[index], out[index])
+        out[index] = out[index].reshape([
+            output_height[0, index] * output_width[0, index],
+            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+        ])
+    out = np.concatenate(out, axis=0)
     return out
 
 
@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
         }
 
     def setUp(self):
@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
             self.batch_size, self.img_channels, self.img_height, self.img_width
         ]).astype("float32")
 
-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
         self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
         }
 
 
 class TestBlockExpandOpCase3(TestBlockExpandOp):
     def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
         self.img_channels = 1
         self.img_height = 4
         self.img_width = 5
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
         }
 
 
@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
+        }
+
+
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
         }
 
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
 
 if __name__ == '__main__':
     unittest.main()
+#set shiftwidth=4 set expandtab set tabstop=4
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 15a72cb605911dfe957fb927763174521a30a085..b215e379864e919af03591ab2566c08dddbb5743 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -27,12 +27,13 @@ class TestConstantInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.ConstantInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.ConstantInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'fill_constant')
@@ -43,12 +44,13 @@ class TestConstantInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.ConstantInitializer(2.3))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.ConstantInitializer(2.3))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'fill_constant')
@@ -61,12 +63,13 @@ class TestUniformInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -80,18 +83,19 @@ class TestUniformInitializer(unittest.TestCase):
         program = framework.Program()
         program.random_seed = 123
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer())
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer(seed=456))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param1",
+                initializer=initializer.UniformInitializer())
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param2",
+                initializer=initializer.UniformInitializer(seed=456))
         init_op = block.ops[1]
         self.assertEqual(init_op.attr("seed"), 123)
         init_op1 = block.ops[0]
@@ -102,12 +106,13 @@ class TestUniformInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -115,6 +120,25 @@ class TestUniformInitializer(unittest.TestCase):
         self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 123)
 
+    def test_uniform_initializer_two_op(self):
+        """Test uniform initializer with supplied attributes
+        """
+        program = framework.Program()
+        block = program.global_block()
+        for i in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer(-4.2, float(i), 123))
+        self.assertEqual(len(block.ops), 1)
+        init_op0 = block.ops[0]
+        self.assertEqual(init_op0.type, 'uniform_random')
+        self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA)
+        self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA)
+        self.assertEqual(init_op0.attr('seed'), 123)
+
 
 class TestNormalInitializer(unittest.TestCase):
     def test_normal_initializer_default_value(self):
@@ -122,12 +146,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -140,12 +165,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer(2.3, 1.9, 123))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer(2.3, 1.9, 123))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -161,12 +187,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -181,12 +208,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -203,12 +231,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -223,12 +252,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -244,13 +274,14 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(
-                fan_in=12, fan_out=23, seed=134))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(
+                    fan_in=12, fan_out=23, seed=134))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -267,12 +298,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -287,12 +319,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -308,12 +341,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -328,12 +362,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -348,13 +383,14 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(
-                fan_in=12, seed=134))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(
+                    fan_in=12, seed=134))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -370,12 +406,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[8, 1, 3, 3],
-            lod_level=0,
-            name="param",
-            initializer=initializer.BilinearInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[8, 1, 3, 3],
+                lod_level=0,
+                name="param",
+                initializer=initializer.BilinearInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'assign_value')
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 842d34c07e94a79e3351347e2528ecc478cc56dc..9e1b47643a554bc14170fc57ac05b21afdb8117a 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
@@ -173,6 +174,16 @@ class TestBook(unittest.TestCase):
                     x=dat, label=lbl))
         print(str(program))
 
+    def test_hsigmoid(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[2], dtype='int64')
+            self.assertIsNotNone(
+                layers.hsigmoid(
+                    input=x, label=y, num_classes=2))
+        print(str(program))
+
     def test_sequence_expand(self):
         program = Program()
         with program_guard(program):
@@ -238,7 +249,7 @@ class TestBook(unittest.TestCase):
     def test_get_places(self):
         program = Program()
         with program_guard(program):
-            x = layers.get_places(device_count=4)
+            x = get_places(device_count=4)
             self.assertIsNotNone(x)
         print(str(program))
 
@@ -251,12 +262,16 @@ class TestBook(unittest.TestCase):
         print(str(program))
 
     def test_im2sequence(self):
-        print("test_im2sequence")
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
             output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
             self.assertIsNotNone(output)
         print(str(program))
 
@@ -428,6 +443,37 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(ids)
         print(str(program))
 
+    def test_rank_loss(self):
+        program = Program()
+        with program_guard(program):
+            label = layers.data(
+                name='label',
+                append_batch_size=False,
+                shape=[16, 1],
+                dtype="float32")
+            left = layers.data(
+                name='left',
+                append_batch_size=False,
+                shape=[16, 1],
+                dtype="float32")
+            right = layers.data(
+                name='right',
+                append_batch_size=False,
+                shape=[16, 1],
+                dtype="float32")
+            out = layers.rank_loss(label, left, right, name="rank_loss")
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_shape(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input, name="shape")
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 6382e290eb30c621da64d5c600be6d8a7c6254f1..e628195e7265ec564bd64a212c4a35fdff495063 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -91,20 +91,21 @@ class TestLearningRateDecay(unittest.TestCase):
 
     def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                                kwargs):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
 
-        decayed_lr = fluid_decay_fn(**kwargs)
+        with fluid.program_guard(main_prog, startup_prog):
+            decayed_lr = fluid_decay_fn(**kwargs)
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
 
-        exe.run(fluid.default_startup_program())
+        exe.run(startup_prog)
 
-        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(main_prog)
 
         for step in range(10):
-            lr_val, = exe.run(fluid.default_main_program(),
-                              feed={},
-                              fetch_list=[decayed_lr])
+            lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             python_decayed_lr = python_decay_fn(
                 global_step=float(step), **kwargs)
             self.assertAlmostEqual(
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index bac5e502318397b43e9867d5fc9e4e8cd33394b8..16e85830ffa51ec428951570cc7a038f3d10c873 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.layers import lod_rank_table, data
+from paddle.fluid.layers import data
+from paddle.fluid.layers.control_flow import lod_rank_table
 from paddle.fluid.executor import Executor
 import paddle.fluid.core as core
 import numpy
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index cebe6997bb4152519dabbabfc0404d6036bc4e65..5a4580116bc7009c73f1de14a265bf2cea5acf9b 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
 
+from paddle.fluid.layers.control_flow import lod_rank_table
+from paddle.fluid.layers.control_flow import max_sequence_len
+from paddle.fluid.layers.control_flow import lod_tensor_to_array
+from paddle.fluid.layers.control_flow import array_to_lod_tensor
+
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
     def place(self):
@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         with program_guard(program):
             x = layers.data(name='x', shape=[10])
             x.persistable = True
-            table = layers.lod_rank_table(x, level=level)
-            max_len = layers.max_sequence_len(table)
+            table = lod_rank_table(x, level=level)
+            max_len = max_sequence_len(table)
             max_len.persistable = True
-            array = layers.lod_tensor_to_array(x, table)
+            array = lod_tensor_to_array(x, table)
             array.persistable = True
 
-            result = layers.array_to_lod_tensor(array, table)
+            result = array_to_lod_tensor(array, table)
             result.persistable = True
         exe = Executor(place)
         scope = core.Scope()
@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
         with program_guard(program):
             x = layers.data(
                 name='x', shape=[1], dtype='float32', stop_gradient=False)
-            table = layers.lod_rank_table(x, level=0)
-            array = layers.lod_tensor_to_array(x, table)
-            result = layers.array_to_lod_tensor(array, table)
+            table = lod_rank_table(x, level=0)
+            array = lod_tensor_to_array(x, table)
+            result = array_to_lod_tensor(array, table)
 
             mean = layers.mean(result)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index f8d5785fbfe64843f4aa3b96b24809df60980c74..ad0d555198c36c12fd1cc39c41d39b24b40f64c3 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -35,6 +35,22 @@ class TestLookupTableOp(OpTest):
         self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
 
 
+class TestLookupTableOpWithTensorIds(OpTest):
+    def setUp(self):
+        self.op_type = "lookup_table"
+        table = np.random.random((17, 31)).astype("float32")
+        ids = np.random.randint(
+            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
+        self.inputs = {'W': table, 'Ids': ids}
+        self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+
+
 class TestLookupTableOpWithPadding(TestLookupTableOp):
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
@@ -44,68 +60,34 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
         self.check_output()
 
     def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of 
+        # Since paddings are not trainable and fixed in forward, the gradient of
         # paddings makes no sense and we don't test the gradient here.
         pass
 
 
-class TestLookupTableIdsIsSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Variable
-        height = 10
-        rows = [0, 4, 4, 7]
-        row_numel = 12
-
-        # create and initialize W Variable
-        W = scope.var('W').get_tensor()
-        W_array = np.full((height, row_numel), 1.0).astype("float32")
-        for i in range(height):
-            W_array[i] *= i
-        W.set(W_array, place)
-
-        # create and initialize Ids Variable
-        ids_selected_rows = scope.var('Ids').get_selected_rows()
-        ids_selected_rows.set_height(len(rows))
-        ids_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        ids_tensor = ids_selected_rows.get_tensor()
-        ids_tensor.set(np_array, place)
-
-        # create Out Variable
-        Out = scope.var('Out').get_selected_rows()
-
-        # create and run lookup_table operator
-        concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
-        concat_rows_op.run(scope, place)
-
-        # get result from Out
-        Out_tensor = Out.get_tensor()
-        result_array = np.array(Out_tensor)
-
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(rows):
-            assert (row == result_array[idx]).all()
+class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+    def test_check_output(self):
+        ids = self.inputs['Ids']
+        flatten_idx = ids.flatten()
+        padding_idx = np.random.choice(flatten_idx, 1)[0]
+        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
+        self.attrs = {'padding_idx': long(padding_idx)}
+        self.check_output()
 
-    def test_concat_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
+    def test_check_grad(self):
+        # Since paddings are not trainable and fixed in forward, the gradient of
+        # paddings makes no sense and we don't test the gradient here.
+        pass
 
 
 class TestLookupTableWIsSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Id Variable
+    def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
         ids_tensor.set(ids_array, place)
+        return ids_array
 
-        # create and initialize W Variable
+    def prepare_w(self, scope, place):
         rows = [0, 1, 2, 3, 4, 5, 6]
         row_numel = 12
 
@@ -118,8 +100,22 @@ class TestLookupTableWIsSelectedRows(OpTest):
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
-        # create Out Variable
-        out_tensor = scope.var('Out').get_tensor()
+    def create_out_tensor(self, scope, place):
+        return scope.var('Out').get_tensor()
+
+    def check_result(self, ids_array, result_array):
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array):
+            assert (row[0] == result_array[idx]).all()
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        ids_array = self.prepare_ids(scope, place)
+
+        self.prepare_w(scope, place)
+
+        out_tensor = self.create_out_tensor(scope, place)
 
         # create and run lookup_table operator
         lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
@@ -127,9 +123,8 @@ class TestLookupTableWIsSelectedRows(OpTest):
 
         # get result from Out
         result_array = np.array(out_tensor)
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(ids_array):
-            assert (row[0] == result_array[idx]).all()
+
+        self.check_result(ids_array, result_array)
 
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
@@ -138,5 +133,19 @@ class TestLookupTableWIsSelectedRows(OpTest):
             self.check_with_place(place)
 
 
+class TestLookupTableWithTensorIdsWIsSelectedRows(
+        TestLookupTableWIsSelectedRows):
+    def prepare_ids(self, scope, place):
+        ids_tensor = scope.var('Ids').get_tensor()
+        ids_array = np.random.randint(
+            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
+        ids_tensor.set(ids_array, place)
+        return ids_array
+
+    def check_result(self, ids_array, result_array):
+        for idx, row in np.ndenumerate(ids_array):
+            assert (row == result_array[idx]).all()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index cfd6e63e12258a92447e68b4afbc7ead91b68cc1..67733807f8f8582f68dcfa3f361e13a631a29597 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
         print(str(result_program))
 
 
+class TestMemoryTranspiler2(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.program = program
+
+    def test_inplace_ops(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9daf83652e18faab0ab31402b9f5889a0beceaf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -0,0 +1,69 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+
+
+def train_simulator(test_batch_size=10):
+    if test_batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(test_batch_size))
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    # Calculate memory usage in current network config 
+    lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+        fluid.default_main_program(), batch_size=test_batch_size)
+
+    print("memory usage is about %.3f - %.3f %s" %
+          (lower_usage, upper_usage, unit))
+
+
+class TestMemoryUsage(unittest.TestCase):
+    def test_with_unit_B(self):
+        with self.program_scope_guard():
+            train_simulator()
+
+    def test_with_unit_KB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=1000)
+
+    def test_with_unit_MB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=100000)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index aaea9c1809213c5707e8540eebbdd6f269836fdc..c75d3bd276a5b494090c1aa1fea0bb4f2c067173 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):
 
         velocity_out = mu * velocity + grad
         if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                         velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):
 
         velocity_out = mu * velocity + grad
         if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                         velocity_out * mu * learning_rate
         else:
             param_out = param - learning_rate * velocity_out
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5..cb0ea96ff69ce32b0bb1b49f0318c353aa08d388 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -39,17 +39,17 @@ class TestMultipleReader(unittest.TestCase):
         copyfile('./mnist_0.recordio', './mnist_1.recordio')
         copyfile('./mnist_0.recordio', './mnist_2.recordio')
 
-    def main(self, thread_num):
+    def main(self, is_test=False):
         file_list = [
             './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
         ]
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data_files = fluid.layers.open_files(
                 filenames=file_list,
-                thread_num=thread_num,
                 shapes=[(-1, 784), (-1, 1)],
                 lod_levels=[0, 0],
-                dtypes=['float32', 'int64'])
+                dtypes=['float32', 'int64'],
+                is_test=is_test)
             img, label = fluid.layers.read_file(data_files)
 
             if fluid.core.is_compiled_with_cuda():
@@ -71,6 +71,9 @@ class TestMultipleReader(unittest.TestCase):
             self.assertEqual(batch_count, self.num_batch * 3)
 
     def test_main(self):
-        self.main(thread_num=3)  # thread number equals to file number
-        self.main(thread_num=10)  # thread number is larger than file number
-        self.main(thread_num=2)  # thread number is less than file number
+        self.main(is_test=False)
+        self.main(is_test=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 7286c7c450108c4b5ad7136041bc4e989894a2ba..18921d727f94a85b69259c07273f09c3e19390c6 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        opts = adagrad_optimizer._create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "adagrad"])
@@ -278,8 +278,8 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                        init_program)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
             [op.type for op in opts],
@@ -287,7 +287,7 @@ class TestAdamOptimizer(unittest.TestCase):
 
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 4)
         self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
         self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
         moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
@@ -345,8 +345,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                         init_program)
+        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                          init_program)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
             [op.type for op in opts],
@@ -354,7 +354,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
 
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 3)
         self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
         self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
         moment_acc = accumulators[adamax_optimizer.get_moment_str()]
@@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer.create_optimization_pass(
+        opts = decayed_adagrad_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
@@ -475,8 +475,8 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
+                                                        init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "ftrl"])
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 1f5d2f16773efb7537de85abec88344f8e0daa9f..60d63364d5f403f04519363db5ad3ad136f8a975 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -15,6 +15,7 @@
 import paddle.dataset.flowers as flowers
 import math
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import unittest
 import numpy as np
 import paddle
@@ -92,7 +93,8 @@ class TestFetchOp(unittest.TestCase):
             train_inputs.append(tst_reader_iter.next())
 
         os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(train_inputs, seed=1, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(train_inputs, seed=1, use_cuda=True)
         self.parallel_exe(train_inputs, seed=1, use_cuda=False)
 
 
@@ -137,7 +139,8 @@ class TestFeedParallel(unittest.TestCase):
 
     def test_feed_op(self):
         os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(use_cuda=True, seed=1)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(use_cuda=True, seed=1)
         self.parallel_exe(use_cuda=False, seed=1)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index a801d99aa1ced35eb7f081fde63ad541f0eb2589..3a314f49ebe5091aa35299ea32ec593026a57c75 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -14,6 +14,7 @@
 
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import paddle
 import paddle.dataset.mnist as mnist
@@ -32,9 +33,7 @@ def simple_fc_net(use_feed):
             filenames=[MNIST_RECORDIO_FILE],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
         reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
     hidden = img
@@ -60,9 +59,7 @@ def fc_with_batchnorm(use_feed):
             filenames=[MNIST_RECORDIO_FILE],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
         reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
 
@@ -101,86 +98,119 @@ class TestMNIST(TestParallelExecutorBase):
             fluid.recordio_writer.convert_reader_to_recordio_file(
                 MNIST_RECORDIO_FILE, reader, feeder)
 
-    def check_simple_fc_convergence(self,
-                                    balance_parameter_opt_between_cards,
-                                    use_cuda=True):
+    def _init_data(self):
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare_reduce_and_allreduce(self, model, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        self.check_network_convergence(
+            model, use_cuda=use_cuda, use_reduce=True)
+        self.check_network_convergence(
+            model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
+
+        img, label = self._init_data()
+
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
+
+    # simple_fc
+    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
         self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
         self.check_network_convergence(
             simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
 
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+        img, label = self._init_data()
+
         self.check_network_convergence(
             simple_fc_net,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=use_reduce)
 
     def test_simple_fc(self):
-        self.check_simple_fc_convergence(False, use_cuda=True)
-        self.check_simple_fc_convergence(False, use_cuda=False)
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+        self.check_simple_fc_convergence(False)
 
     def test_simple_fc_with_new_strategy(self):
-        self.check_simple_fc_convergence(True, use_cuda=True)
-        self.check_simple_fc_convergence(True, use_cuda=False)
+        # use_cuda, use_reduce
+        self._compare_reduce_and_allreduce(simple_fc_net, True)
+        self._compare_reduce_and_allreduce(simple_fc_net, False)
+
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data()
 
-    def check_simple_fc_parallel_accuracy(self,
-                                          balance_parameter_opt_between_cards,
-                                          use_cuda=True):
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            use_parallel_executor=True,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_parallel_executor=True)
 
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
-        self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
+        self.check_simple_fc_parallel_accuracy(True)
+        self.check_simple_fc_parallel_accuracy(False)
 
-    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
-        self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
+    def check_batchnorm_fc_convergence(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
 
-    def check_batchnorm_fc_convergence(
-            self, balance_parameter_opt_between_cards, use_cuda):
         self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+
+        img, label = self._init_data()
+
         self.check_network_convergence(
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_cuda=use_cuda)
 
     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(False, use_cuda=True)
-        self.check_batchnorm_fc_convergence(False, use_cuda=False)
+        self.check_batchnorm_fc_convergence(True)
+        self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self.check_batchnorm_fc_convergence(True, use_cuda=True)
-        self.check_batchnorm_fc_convergence(True, use_cuda=False)
+        # FIXME(zcd): close this test temporally.
+        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 066299e6c6f7f6c159cb0886e86d3404b027b698..a28428d8dee201ba105e18684c15d4b4582d989f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -13,9 +13,27 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.initializer import init_on_cpu
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
+import math
 import os
+import numpy as np
+
+# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
+# and Executor is different. Because, for ParallelExecutor, the dropout_op of
+# the neural net will be copied N copies(N is the number of device). This will
+# lead to the random numbers generated by ParallelExecutor and Executor are different.
+# So, if we compare the loss of ParallelExecutor and Executor, we should remove the
+# dropout_op.
+remove_dropout = False
+
+# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
+# and Executor is different.
+remove_bn = False
 
 
 def squeeze_excitation(input, num_channels, reduction_ratio):
@@ -48,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
         groups=groups,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+    return conv if remove_bn else fluid.layers.batch_norm(
+        input=conv, act=act, momentum=0.1)
 
 
 def shortcut(input, ch_out, stride):
@@ -87,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
 
 
-def SE_ResNeXt50Small(batch_size=2, use_feed=False):
-    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+batch_size = 12
+img_shape = [3, 224, 224]
+
+
+def SE_ResNeXt50Small(use_feed):
 
-    img = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
+    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     conv = conv_bn_layer(
         input=img, num_filters=16, filter_size=3, stride=2, act='relu')
@@ -122,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     reshape = fluid.layers.reshape(
         x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    dropout = pool if remove_dropout else fluid.layers.dropout(
+        x=pool, dropout_prob=0.2, seed=1)
     # Classifier layer:
     prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -130,31 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     return loss
 
 
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """
+    Applies cosine decay to the learning rate.
+    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
+    """
+    global_step = _decay_step_counter()
+
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
+    return decayed_lr
+
+
+def optimizer(learning_rate=0.01):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=cosine_decay(
+            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    return optimizer
+
+
 class TestResnet(TestParallelExecutorBase):
-    def check_resnet_convergence(self,
-                                 balance_parameter_opt_between_cards,
-                                 use_cuda=True,
-                                 iter=20):
+    @classmethod
+    def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
+        global remove_dropout
+        global remove_bn
+        remove_dropout = False
+        remove_bn = False
+
+    def _init_data(self, batch_size=2, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(
+                size=[batch_size] + img_shape).astype(np.float32)
+        else:
+            img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
+        label = [np.random.randint(0, 999) for _ in range(batch_size)]
+        label = np.array(label).astype(np.int64).reshape(-1, 1)
+        return img, label
 
-        import functools
-        batch_size = 2
-        self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      iter=20,
+                                      delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_bn
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+    def _check_resnet_convergence(self,
+                                  model,
+                                  use_cuda=True,
+                                  use_reduce=False,
+                                  iter=20,
+                                  delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_dropout
+        global remove_bn
+        remove_dropout = True
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=use_reduce,
+            optimizer=optimizer,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=use_reduce,
+            optimizer=optimizer)
+
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
 
-    def test_resnet(self):
-        self.check_resnet_convergence(False, use_cuda=True)
-        self.check_resnet_convergence(False, use_cuda=False, iter=5)
+    def test_seresnext_with_learning_rate_decay(self):
+        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
+        self._check_resnet_convergence(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
 
-    def test_resnet_with_new_strategy(self):
-        self.check_resnet_convergence(True, use_cuda=True)
-        self.check_resnet_convergence(True, use_cuda=False, iter=5)
+    def test_seresnext_with_new_strategy(self):
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 9a2733927d38f1a2b1af92fcc12f036158b4d06f..7688b8495d7f7c60e80f62dae2edd72be9f839d4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import unittest
 import os
@@ -92,16 +93,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
     def test_parallel_testing(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
         self.check_network_convergence(
             use_cuda=False, build_strategy=build_strategy)
 
     def test_parallel_testing_with_new_strategy(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
         self.check_network_convergence(
             use_cuda=False, build_strategy=build_strategy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b6215fddb11bb6b3a76b5a6395e7254d21971c13..8203d5d1fce0950130ab71db40fb306f73c41bd4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
 
 
 class ModelHyperParams(object):
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
                     writer.append_tensor(t)
                 writer.complete_append_tensor()
 
-    @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer, use_cuda=True)
-        self.check_network_convergence(transformer, use_cuda=False)
+        self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index 79bea148f9398152a02d70946cdc5fff1f47ba6b..18309f457704f522457daefdb8464ae5df2ffcfb 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -15,6 +15,7 @@
 import unittest
 
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy
 
@@ -113,11 +114,13 @@ class BaseParallelForTest(unittest.TestCase):
             generator = callback()
             # Automatically insert parallel do if use_parallel = True
             if use_parallel:
-                places = fluid.layers.get_places()
+                thread_num = fluid.core.get_cuda_device_count(
+                ) if use_gpu else 8
+                places = get_places(thread_num)
                 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
                 data = next(generator)
 
-                if isinstance(data, fluid.Variable):
+                if isinstance(data, fluid.framework.Variable):
                     data = [data]
 
                 with pd.do():
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index e01af42a58b86042fd0282928d1a78d9c3239fe3..b461c5c9401d74ef8dcf4afc84dc0ea6920a2419 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -56,6 +56,8 @@ class TestPrintOpCPU(unittest.TestCase):
                        return_numpy=False)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestPrintOpGPU(TestPrintOpCPU):
     def setUp(self):
         self.place = core.CUDAPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
index bcbc02a2baa46b9ab583ecf3006bd3262e6038fd..e15554737b9f3fa36382dde15ded928271679538 100644
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -32,6 +32,7 @@ class TestPriorBoxOp(OpTest):
             'variances': self.variances,
             'flip': self.flip,
             'clip': self.clip,
+            'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
             'step_w': self.step_w,
             'step_h': self.step_h,
             'offset': self.offset
@@ -52,6 +53,9 @@ class TestPriorBoxOp(OpTest):
         max_sizes = [5, 10]
         self.max_sizes = np.array(max_sizes).astype('float32').tolist()
 
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = False
+
     def init_test_params(self):
         self.layer_w = 32
         self.layer_h = 32
@@ -71,6 +75,7 @@ class TestPriorBoxOp(OpTest):
         self.set_max_sizes()
         self.aspect_ratios = [2.0, 3.0]
         self.flip = True
+        self.set_min_max_aspect_ratios_order()
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
         self.aspect_ratios = np.array(
             self.aspect_ratios, dtype=np.float).flatten()
@@ -78,7 +83,6 @@ class TestPriorBoxOp(OpTest):
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
         self.clip = True
-
         self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
         if len(self.max_sizes) > 0:
             self.num_priors += len(self.max_sizes)
@@ -106,26 +110,60 @@ class TestPriorBoxOp(OpTest):
                 idx = 0
                 for s in range(len(self.min_sizes)):
                     min_size = self.min_sizes[s]
-                    # rest of priors
-                    for r in range(len(self.real_aspect_ratios)):
-                        ar = self.real_aspect_ratios[r]
-                        c_w = min_size * math.sqrt(ar) / 2
-                        c_h = (min_size / math.sqrt(ar)) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
-
-                    if len(self.max_sizes) > 0:
-                        max_size = self.max_sizes[s]
-                        # second prior: aspect_ratio = 1,
-                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                    if not self.min_max_aspect_ratios_order:
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                    else:
+                        c_w = c_h = min_size / 2.
                         out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                    (c_y - c_h) / self.image_h,
                                                    (c_x + c_w) / self.image_w,
                                                    (c_y + c_h) / self.image_h]
                         idx += 1
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            if abs(ar - 1.) < 1e-6:
+                                continue
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
 
         # clip the prior's coordidate such that it is within[0, 1]
         if self.clip:
@@ -137,10 +175,15 @@ class TestPriorBoxOp(OpTest):
         self.out_var = out_var.astype('float32')
 
 
-class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
     def set_max_sizes(self):
         self.max_sizes = []
 
 
+class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a..9f8d33f9bbfc78b6f1a0c089b34b2f41d561c640 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -79,12 +79,18 @@ class TestProfiler(unittest.TestCase):
                 pass_acc_calculator.add(value=acc, weight=b_size)
                 pass_acc = pass_acc_calculator.eval()
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
         self.net_profiler('GPU')
 
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
     def test_all_profiler(self):
         self.net_profiler('All', '/tmp/profile_out')
         with open('/tmp/profile_out', 'r') as f:
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 3f9059fb5b31cd009c068ccddc9a8938adae5772..f75a79bfa42405747df9e6f4f4ab743014e303b9 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -181,13 +181,13 @@ class TestBlockDesc(unittest.TestCase):
         self.assertIsNotNone(block)
         op1 = block.append_op()
         op2 = block.append_op()
-        op0 = block.prepend_op()
+        op0 = block._prepend_op()
         all_ops = []
         for idx in xrange(0, block.op_size()):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op1, op2])
 
-    def test_remove_op(self):
+    def test__remove_op(self):
         program = Program()
         program_desc = program.desc
         self.assertIsNotNone(program_desc)
@@ -201,8 +201,8 @@ class TestBlockDesc(unittest.TestCase):
         op1.set_type("test")
         op2.set_type("test")
 
-        block.remove_op(1, 2)
-        program.sync_with_cpp()
+        block._remove_op(1, 2)
+        program._sync_with_cpp()
 
         all_ops = []
         for idx in xrange(0, block.op_size()):
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b1fd2af7d8aaf85d17965f8b02c35ee3990291
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import numpy as np
+from threading import Thread
+
+
+def feed_data(feed_queue, inputs):
+    for in_data in inputs:
+        feed_queue.push(in_data)
+
+
+class TestPyReader(unittest.TestCase):
+    def setUp(self):
+        self.capacity = 10
+        self.batch_size_min = 10
+        self.batch_size_max = 20
+        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
+        self.lod_levels = [0, 0]
+        self.dtypes = ['float32', 'int64']
+        self.iterations = 20
+
+    def test_single_thread_main(self):
+        self.main(use_thread=False)
+
+    def test_multiple_thread_main(self):
+        self.main(use_thread=True)
+
+    def main(self, use_thread=False):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            executor = fluid.Executor(place)
+
+            data_file = fluid.layers.py_reader(
+                capacity=self.capacity,
+                dtypes=self.dtypes,
+                lod_levels=self.lod_levels,
+                shapes=self.shapes)
+            feed_queue = data_file.queue
+            read_out_data = fluid.layers.read_file(data_file)
+            self.inputs = []
+
+            for i in range(self.iterations):
+                in_data = fluid.LoDTensorArray()
+                batch_size = np.random.random_integers(self.batch_size_min,
+                                                       self.batch_size_max)
+                for shape, dtype in zip(self.shapes, self.dtypes):
+                    next_data = np.random.uniform(
+                        low=0, high=1000,
+                        size=(batch_size, ) + shape[1:]).astype(dtype)
+                    in_data.append(executor.as_lodtensor(next_data))
+
+                self.inputs.append(in_data)
+
+            executor.run(fluid.default_startup_program())
+            self.outputs = []
+            if use_thread:
+                thread = Thread(
+                    target=feed_data, args=(feed_queue, self.inputs))
+                thread.start()
+                for in_data in self.inputs:
+                    self.outputs.append(
+                        executor.run(fetch_list=list(read_out_data)))
+            else:
+                for in_data in self.inputs:
+                    feed_queue.push(in_data)
+                    self.outputs.append(
+                        executor.run(fetch_list=list(read_out_data)))
+
+            feed_queue.close()
+            self.validate()
+
+    def validate(self):
+        self.assertEqual(len(self.inputs), len(self.outputs))
+        for in_data_list, out_data_list in zip(self.inputs, self.outputs):
+            self.assertEqual(len(in_data_list), len(out_data_list))
+            in_data_list_np = [
+                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
+            ]
+            for in_data, out_data in zip(in_data_list_np, out_data_list):
+                self.assertTrue((in_data == out_data).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a379bdbaa7e278879117a8cdc2dddb335a10ca1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import threading
+import multiprocessing
+import os
+
+
+def as_tensor(np_array_or_tensor, place=None):
+    if isinstance(np_array_or_tensor, fluid.LoDTensor):
+        return np_array_or_tensor
+
+    if place is None:
+        place = fluid.CPUPlace()
+
+    tensor = fluid.LoDTensor()
+    tensor.set(np_array_or_tensor, place)
+    return tensor
+
+
+def as_numpy(tensor_or_numpy):
+    return tensor_or_numpy if isinstance(
+        tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy)
+
+
+def feed_data(feed_queue, reader):
+    data_generator = reader()
+    while True:
+        data = next(data_generator, None)
+        if data is None or not feed_queue.push(data):
+            break
+
+
+def simple_fc_net(in_size,
+                  class_num,
+                  hidden_sizes,
+                  batch_size,
+                  queue_capacity,
+                  use_double_buffer=False):
+    reader = fluid.layers.py_reader(
+        capacity=queue_capacity,
+        shapes=[[-1, in_size], [-1, 1]],
+        lod_levels=[0, 0],
+        dtypes=['float32', 'int64'],
+        use_double_buffer=False)
+    feed_queue = reader.queue
+    reader = fluid.layers.batch(reader, batch_size=batch_size)
+    if use_double_buffer:
+        reader = fluid.layers.double_buffer(reader)
+
+    in_data, label = fluid.layers.read_file(reader)
+
+    hidden = in_data
+    for hidden_size in hidden_sizes:
+        hidden = fluid.layers.fc(
+            hidden,
+            size=hidden_size,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+    predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax')
+    loss = fluid.layers.mean(
+        fluid.layers.cross_entropy(
+            input=predict_label, label=label))
+
+    optimizer = fluid.optimizer.Adam()
+    optimizer.minimize(loss)
+    return in_data, label, loss, optimizer, feed_queue
+
+
+class TestPyReaderUsingExecutor(unittest.TestCase):
+    def setUp(self):
+        self.in_size = 1000
+        self.hidden_sizes = [50, 30, 20]
+        self.class_num = 10
+        self.batch_size = 32
+        self.iterations = 10
+        self.queue_capacity = 50
+
+    def test(self):
+        for use_cuda in [False, True]:
+            for use_parallel_executor in [False, True]:
+                for use_double_buffer in [False, True]:
+                    print('Test Parameters:'),
+                    print({
+                        'use_cuda': use_cuda,
+                        'use_parallel_executor': use_parallel_executor,
+                        'use_double_buffer': use_double_buffer
+                    })
+                    self.main(use_cuda, use_parallel_executor,
+                              use_double_buffer)
+
+    def random_reader(self):
+        def reader():
+            self.inputs = []
+            cnt = 0
+            while True:
+                tensors = fluid.LoDTensorArray()
+                in_data = np.random.uniform(
+                    low=0, high=1, size=(1, self.in_size)).astype('float32')
+                tensors.append(as_tensor(in_data))
+                label = np.random.random_integers(
+                    low=0, high=self.class_num - 1, size=(1, 1)).astype('int64')
+                tensors.append(as_tensor(label))
+
+                if cnt < self.iterations * self.batch_size * self.batch_size_times:
+                    if cnt % (self.batch_size * self.batch_size_times) == 0:
+                        self.inputs.append([in_data, label])
+                    else:
+                        self.inputs[-1][0] = np.concatenate(
+                            (self.inputs[-1][0], in_data), axis=0)
+                        self.inputs[-1][1] = np.concatenate(
+                            (self.inputs[-1][1], label), axis=0)
+                elif not self.use_double_buffer:
+                    break
+
+                yield tensors
+                cnt += 1
+
+            yield None
+
+        return reader
+
+    def main(self,
+             use_cuda=True,
+             use_parallel_executor=False,
+             use_double_buffer=False):
+        assert not use_cuda or use_cuda and core.is_compiled_with_cuda()
+
+        self.use_cuda = use_cuda
+        self.use_parallel_executor = use_parallel_executor
+        self.use_double_buffer = use_double_buffer
+
+        startup_program = fluid.Program()
+        main_program = fluid.Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            in_data, label, loss, optimizer, feed_queue = simple_fc_net(
+                in_size=self.in_size,
+                class_num=self.class_num,
+                hidden_sizes=self.hidden_sizes,
+                batch_size=self.batch_size,
+                queue_capacity=self.queue_capacity,
+                use_double_buffer=self.use_double_buffer)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup_program)
+
+            if use_parallel_executor:
+                main_exe = fluid.ParallelExecutor(use_cuda, loss_name=loss.name)
+                if use_cuda:
+                    self.batch_size_times = core.get_cuda_device_count()
+                else:
+                    self.batch_size_times = int(
+                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            else:
+                main_exe = startup_exe
+                self.batch_size_times = 1
+
+            reader = self.random_reader()
+            thread = threading.Thread(
+                target=feed_data, args=(feed_queue, reader))
+            thread.start()
+
+            self.outputs = []
+            for _ in range(self.iterations):
+                fetches = main_exe.run(fetch_list=[in_data.name, label.name])
+                fetches = [as_numpy(fetch) for fetch in fetches]
+                self.outputs.append(fetches)
+
+            feed_queue.close()
+            self.validate()
+
+    def validate(self):
+        self.assertEqual(len(self.inputs), len(self.outputs))
+        for batch_in, batch_out in zip(self.inputs, self.outputs):
+            self.assertEqual(len(batch_in), len(batch_out))
+            if self.use_parallel_executor and not self.use_double_buffer:
+                self.validate_unordered_batch(batch_in, batch_out)
+            else:
+                for in_data, out_data in zip(batch_in, batch_out):
+                    self.assertEqual(in_data.shape, out_data.shape)
+                    if not self.use_parallel_executor:
+                        self.assertTrue((in_data == out_data).all())
+
+    def validate_unordered_batch(self, batch_in, batch_out):
+        out_index_left_set = set(range(self.batch_size * self.batch_size_times))
+        mapping_num = 0
+        for i in range(self.batch_size * self.batch_size_times):
+            for j in out_index_left_set:
+                flag = True
+                for k in range(len(batch_in)):
+                    in_data = batch_in[k][i]
+                    out_data = batch_out[k][j]
+                    if (in_data != out_data).any():
+                        flag = False
+                        break
+
+                if flag:
+                    out_index_left_set.remove(j)
+                    mapping_num += 1
+                    break
+
+        self.assertEqual(mapping_num, self.batch_size * self.batch_size_times)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d35183647ea57e378f0fe201ef03001122cb329f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import numpy as np
+import unittest
+
+
+class TestReaderReset(unittest.TestCase):
+    def prepare_data(self):
+        def fake_data_generator():
+            for n in xrange(self.total_ins_num):
+                yield np.ones(self.ins_shape) * n, n
+
+        # Prepare data
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(fake_data_generator, batch_size=1)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='data', shape=[3], dtype='float32'),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                self.data_file_name, reader, feeder)
+
+    def setUp(self):
+        self.use_cuda = fluid.core.is_compiled_with_cuda()
+        self.data_file_name = './reader_reset_test.recordio'
+        self.ins_shape = [3]
+        self.batch_size = 5
+        self.total_ins_num = self.batch_size * 20
+        self.test_pass_num = 100
+        self.prepare_data()
+
+    def main(self, with_double_buffer):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+
+        with fluid.program_guard(main_prog, startup_prog):
+            data_reader_handle = fluid.layers.io.open_files(
+                filenames=[self.data_file_name],
+                shapes=[[-1] + self.ins_shape, [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                thread_num=1,
+                pass_num=1)
+            data_reader = fluid.layers.io.batch(data_reader_handle,
+                                                self.batch_size)
+            if with_double_buffer:
+                data_reader = fluid.layers.double_buffer(data_reader)
+            image, label = fluid.layers.read_file(data_reader)
+            fetch_list = [image.name, label.name]
+
+        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+
+        build_strategy = fluid.BuildStrategy()
+        if with_double_buffer:
+            build_strategy.enable_data_balance = True
+        exec_strategy = fluid.ExecutionStrategy()
+        parallel_exe = fluid.ParallelExecutor(
+            use_cuda=self.use_cuda,
+            main_program=main_prog,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
+
+        data_appeared = [False] * self.total_ins_num
+        pass_count = 0
+        while (True):
+            try:
+                data_val, label_val = parallel_exe.run(fetch_list,
+                                                       return_numpy=True)
+                ins_num = data_val.shape[0]
+                broadcasted_label = np.ones((ins_num, ) + tuple(
+                    self.ins_shape)) * label_val.reshape((ins_num, 1))
+                self.assertEqual(data_val.all(), broadcasted_label.all())
+                for l in label_val:
+                    self.assertFalse(data_appeared[l[0]])
+                    data_appeared[l[0]] = True
+
+            except fluid.core.EOFException:
+                pass_count += 1
+                if with_double_buffer:
+                    data_appeared = data_appeared[:-parallel_exe.device_count *
+                                                  self.batch_size]
+                for i in data_appeared:
+                    self.assertTrue(i)
+                if pass_count < self.test_pass_num:
+                    data_appeared = [False] * self.total_ins_num
+                    data_reader_handle.reset()
+                else:
+                    break
+
+    def test_all(self):
+        self.main(with_double_buffer=False)
+        self.main(with_double_buffer=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 865c2b7df085aa6a6cb0d6eb461c342ce08695cd..06d116601bf733986ccf9c725340456ab1258be2 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
     def test_check_output(self):
         self.check_output()
@@ -106,32 +102,82 @@ class TestKeepDimReduce(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
 
 
-class TestReduceAll(OpTest):
+class Test3DReduce0(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce2(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [-2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce3(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1, 2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class TestKeepDimReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
+
+
+class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
         self.attrs = {'reduce_all': True}
         self.outputs = {'Out': self.inputs['X'].sum()}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
 
 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index a70321bd800bf25eeb9e5d197ea7e08626b9aede..6e1cd56b3e309fc014dc981a1e3aa841159fca15 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -15,6 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.layers.control_flow import lod_rank_table
 import numpy
 
 
@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase):
         dat.stop_gradient = False
         rank_dat = fluid.layers.data(
             name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
-        table = fluid.layers.lod_rank_table(rank_dat)
+        table = lod_rank_table(rank_dat)
         new_dat = fluid.layers.reorder_lod_tensor_by_rank(
             x=dat, rank_table=table)
         loss = fluid.layers.reduce_sum(new_dat)
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index f51b5a7e9907294a5b91c920a363830d8b9a7137..2f5558578ac2a002a83c2a7e027ec5a96d8b4414 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
 
     def test_check_output(self):
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6e0faaca6fd007b39a8f358d964055e149a025
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
+                      rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(iou)
+    anchor_to_gt_max = iou.max(axis=1)
+    gt_to_anchor_argmax = iou.argmax(axis=0)
+    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
+    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
+
+    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
+    tgt_lbl[anchors_with_max_overlap] = 1
+    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
+
+    num_fg = int(fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(tgt_lbl == 1)[0]
+    if len(fg_inds) > num_fg:
+        disable_inds = np.random.choice(
+            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+        tgt_lbl[disable_inds] = -1
+    fg_inds = np.where(tgt_lbl == 1)[0]
+
+    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    if len(bg_inds) > num_bg:
+        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
+        tgt_lbl[enable_inds] = 0
+    bg_inds = np.where(tgt_lbl == 0)[0]
+
+    loc_index = fg_inds
+    score_index = np.hstack((fg_inds, bg_inds))
+    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    return loc_index, score_index, tgt_lbl
+
+
+class TestRpnTargetAssignOp(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 8)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 256,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.3,
+            'fg_fraction': 0.25,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
+                                                            0.25)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRpnTargetAssignOp2(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 20)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 128,
+            'rpn_positive_overlap': 0.5,
+            'rpn_negative_overlap': 0.5,
+            'fg_fraction': 0.5,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
+                                                            0.5)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py
index 3d7b86787fbf0a855bcd86b8a873c9134cb1d5cc..f504a06ffff8cb636498652554fca05e22bb905d 100644
--- a/python/paddle/fluid/tests/unittests/test_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py
@@ -40,12 +40,12 @@ class TestSelectedRows(unittest.TestCase):
 
         # compare tensor
         self.assertAlmostEqual(2.0,
-                               selected_rows.get_tensor().get_float_element(0))
+                               selected_rows.get_tensor()._get_float_element(0))
         self.assertAlmostEqual(1.0,
-                               selected_rows.get_tensor().get_float_element(1))
+                               selected_rows.get_tensor()._get_float_element(1))
         self.assertAlmostEqual(
             4.0,
-            selected_rows.get_tensor().get_float_element(2 * row_numel + 8))
+            selected_rows.get_tensor()._get_float_element(2 * row_numel + 8))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index e91a69a0f8039651225039beb2a42e8dffeb62d3..c4fc8b74cf80c3596b0af9f7f0434864591195bd 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -61,6 +61,8 @@ class TestSequenceSoftmaxOp(OpTest):
 
 
 # ----------------cudnn Sequencesoftmax----------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
     def init_op_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index b779f0fb014bbba62927754ea6f36828a32e6c0a..6f0e337034d1010880514181654170316fd9db19 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program
 from paddle.fluid.framework import Program
 import numpy as np
 
+from paddle.fluid.layers.control_flow import shrink_memory
+from paddle.fluid.layers.control_flow import lod_rank_table
+
 
 class TestShrinkRNNMemoryBase(unittest.TestCase):
     def setUp(self):
@@ -30,23 +33,23 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
         x.stop_gradient = False
         rank_table_tensor = layers.data(
             'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
-        table = layers.lod_rank_table(x=rank_table_tensor)
+        table = lod_rank_table(x=rank_table_tensor)
         i = layers.zeros(dtype='int64', shape=[1])
-        self.mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        self.mem1 = shrink_memory(x=x, i=i, table=table)
         i = layers.increment(x=i)
         i.stop_gradient = True
-        self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table)
+        self.mem2 = shrink_memory(x=self.mem1, i=i, table=table)
         i = layers.increment(x=i)
         i.stop_gradient = True
-        self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table)
+        self.mem3 = shrink_memory(x=self.mem2, i=i, table=table)
         mem3_mean = layers.mean(self.mem3)
         append_backward(loss=mem3_mean)
         self.x_grad = self.main_program.global_block().var('x@GRAD')
 
     def sum_lodtensor(self, tensor):
         sum_res = 0.0
-        for i in xrange(np.product(tensor.get_dims())):
-            sum_res += tensor.get_float_element(i)
+        for i in xrange(np.product(tensor.shape())):
+            sum_res += tensor._get_float_element(i)
         return sum_res
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 279f3073f73d1c36f54bb901d92441a7403ac23f..70ad05597c4a160cf6a25aeb3c379320cef69c63 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -26,15 +26,22 @@ def stable_softmax(x):
 
 
 class TestSoftmaxOp(OpTest):
+    def get_x_shape(self):
+        return [10, 10]
+
     def setUp(self):
         self.op_type = "softmax"
         self.use_cudnn = False
         self.use_mkldnn = False
         self.dtype = np.float32
         self.init_kernel_type()
+        self.shape = self.get_x_shape()
+
+        x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1,
+                                  x.reshape([-1, self.shape[-1]]))
+        out = out.reshape(self.shape)
 
-        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
-        out = np.apply_along_axis(stable_softmax, 1, x)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}
         self.attrs = {
@@ -63,11 +70,27 @@ class TestSoftmaxOp(OpTest):
             self.check_grad(["X"], "Out", max_relative_error=0.01)
 
 
+class TestSoftmaxOp2(TestSoftmaxOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_cudnn = True
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
     def init_kernel_type(self):
         self.dtype = np.float16
@@ -79,6 +102,15 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
                 self.check_output_with_place(place, atol=1e-3)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_cudnn = True
@@ -91,10 +123,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
                 self.check_output_with_place(place, atol=1e-3)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
+class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index 0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9..ea1146166d34a31efbd859318b411cea50895fe1 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
+from paddle.fluid.layers.control_flow import split_lod_tensor
+from paddle.fluid.layers.control_flow import merge_lod_tensor
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             y = layers.data(name='y', shape=[1])
             y.persistable = True
 
-            out_true, out_false = layers.split_lod_tensor(
-                input=x, mask=y, level=level)
+            out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
             out_true.persistable = True
             out_false.persistable = True
 
-            out = layers.merge_lod_tensor(
+            out = merge_lod_tensor(
                 in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
 
             out.persistable = True
@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
 
             level = 0
 
-            out_true, out_false = layers.split_lod_tensor(
-                input=x, mask=y, level=level)
-            out = layers.merge_lod_tensor(
+            out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
+            out = merge_lod_tensor(
                 in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
             mean = layers.mean(out)
 
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index e9f0a06a56b42952800411d548bb3fc1732e031e..ca7861309839d183e18c168403881a0b1b5bf309 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -15,6 +15,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class TestSplitIdsOp(OpTest):
@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest):
         self.check_output()
 
 
+class TestSpliteIds(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4, 9]
+        height = 20
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            for j in range(row_numel):
+                np_array[i, j] = rows[i] + j
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        outs_name = ["out%d" % i for i in xrange(3)]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
+
+        # expected output selected rows
+        expected_out_rows = [[0, 9], [7, 4], [5]]
+
+        op = Operator("split_ids", Ids="X", Out=outs_name)
+
+        for _ in range(3):
+            op.run(scope, place)
+
+            for i in range(len(outs)):
+                expected_rows = expected_out_rows[i]
+                self.assertEqual(outs[i].rows(), expected_rows)
+                for j in range(len(expected_rows)):
+                    row = expected_rows[j]
+                    self.assertAlmostEqual(
+                        float(row), np.array(outs[i].get_tensor())[j, 0])
+                    self.assertAlmostEqual(
+                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bca6af2fd5dfadbc48cf1a76cfa6ffd4f1fdfdef
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+# Correct: General.
+class TestSqueezeOp(OpTest):
+    def setUp(self):
+        self.op_type = "squeeze"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": False}
+
+
+# Correct: There is mins axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)
+        self.new_shape = (3, 5)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
+
+
+# Correct: Just part of axes be squeezed. 
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
+
+
+# Correct: Inplace.
+class TestSqueezeOpInplace1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+# Correct: Inplace. There is mins axis.
+class TestSqueezeOpInplace2(TestSqueezeOp):
+    def inti_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)
+        self.new_shape = (3, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+# Correct: Inplace. No axes input.
+class TestSqueezeOpInplace3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+# Correct: Inpalce. Just part of axes be squeezed. 
+class TestSqueezeOpInplace4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index f17edd3025b17549892bbd47935a1d2452cefac3..5ccc876ae8e6e20f76c77c1892f4de59d72bffc8 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -25,8 +25,8 @@ class TestTensor(unittest.TestCase):
 
         tensor = var.get_tensor()
 
-        tensor.set_dims([1000, 784])
-        tensor.alloc_int(place)
+        tensor._set_dims([1000, 784])
+        tensor._alloc_int(place)
         tensor_array = numpy.array(tensor)
         self.assertEqual((1000, 784), tensor_array.shape)
         tensor_array[3, 9] = 1
@@ -44,8 +44,8 @@ class TestTensor(unittest.TestCase):
 
         tensor = var.get_tensor()
 
-        tensor.set_dims([1000, 784])
-        tensor.alloc_float(place)
+        tensor._set_dims([1000, 784])
+        tensor._alloc_float(place)
 
         tensor_array = numpy.array(tensor)
         self.assertEqual((1000, 784), tensor_array.shape)
@@ -63,8 +63,8 @@ class TestTensor(unittest.TestCase):
         var_lod = scope.var("test_lod_tensor")
         lod_tensor = var_lod.get_tensor()
 
-        lod_tensor.set_dims([4, 4, 6])
-        lod_tensor.alloc_int(place)
+        lod_tensor._set_dims([4, 4, 6])
+        lod_tensor._alloc_int(place)
         array = numpy.array(lod_tensor)
         array[0, 0, 0] = 3
         array[3, 3, 5] = 10
@@ -84,8 +84,8 @@ class TestTensor(unittest.TestCase):
         var_lod = scope.var("test_lod_tensor")
 
         lod_tensor = var_lod.get_tensor()
-        lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.alloc_float(place)
+        lod_tensor._set_dims([5, 2, 3, 4])
+        lod_tensor._alloc_float(place)
 
         tensor_array = numpy.array(lod_tensor)
         self.assertEqual((5, 2, 3, 4), tensor_array.shape)
@@ -104,14 +104,13 @@ class TestTensor(unittest.TestCase):
         self.assertListEqual(lod_py, lod)
 
     def test_lod_tensor_init(self):
-        scope = core.Scope()
         place = core.CPUPlace()
         lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
-        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor._set_dims([5, 2, 3, 4])
         lod_tensor.set_recursive_sequence_lengths(lod_py)
-        lod_tensor.alloc_float(place)
+        lod_tensor._alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
         tensor_array[0, 0, 0, 1] = 2.0
@@ -129,9 +128,9 @@ class TestTensor(unittest.TestCase):
         lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
-        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor._set_dims([5, 2, 3, 4])
         lod_tensor.set_recursive_sequence_lengths(lod_py)
-        lod_tensor.alloc_float(place)
+        lod_tensor._alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
         tensor_array[0, 0, 0, 1] = 2.0
@@ -149,15 +148,15 @@ class TestTensor(unittest.TestCase):
 
         tensor = var.get_tensor()
 
-        tensor.set_dims([0, 1])
-        tensor.alloc_float(place)
+        tensor._set_dims([0, 1])
+        tensor._alloc_float(place)
 
         tensor_array = numpy.array(tensor)
         self.assertEqual((0, 1), tensor_array.shape)
 
         if core.is_compiled_with_cuda():
             gpu_place = core.CUDAPlace(0)
-            tensor.alloc_float(gpu_place)
+            tensor._alloc_float(gpu_place)
             tensor_array = numpy.array(tensor)
             self.assertEqual((0, 1), tensor_array.shape)
 
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a4aa0a40b5eb494f6027e800ca6b466bbe1c302
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+# Correct: General.
+class TestUnsqueezeOp(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = "unsqueeze"
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (1, 2)
+        self.new_shape = (3, 1, 1, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": False}
+
+
+# Correct: Single input index.
+class TestUnsqueezeOp1(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (-1, )
+        self.new_shape = (3, 5, 1)
+
+
+# Correct: Mixed input axis.
+class TestUnsqueezeOp2(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, -1)
+        self.new_shape = (1, 3, 5, 1)
+
+
+# Correct: There is duplicated axis.
+class TestUnsqueezeOp3(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (0, 3, 3)
+        self.new_shape = (1, 3, 2, 1, 1, 5)
+
+
+# Correct: Reversed axes.
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (3, 1, 1, 2, 5, 1)
+
+
+# Correct: Inplace.
+class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, 2)
+        self.new_shape = (1, 3, 1, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+# Correct: Inplace. There is mins index.
+class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 5)
+        self.axes = (0, -2)
+        self.new_shape = (1, 3, 1, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+# Correct: Inplace. There is duplicated axis.
+class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 2, 5)
+        self.axes = (0, 3, 3)
+        self.new_shape = (1, 3, 2, 1, 1, 5)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..a09c8a759b9461edcf7d5ddbd62d74408d5f292e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_version.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import re
+
+import paddle.version as fluid_version
+
+
+class VersionTest(unittest.TestCase):
+    def setUp(self):
+        self._major_regex = "[0-9]+"
+        self._minor_regex = "[0-9]+"
+        self._patch_regex = "[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
+        self._rc_regex = "[0-9]+"
+        self._version_regex = "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
+        self._commit_regex = "[0-9a-f]{5,49}"
+
+    def test_check_output(self):
+        # check commit format
+        self.assertTrue(re.match(self._commit_regex, fluid_version.commit))
+        self.assertTrue(isinstance(fluid_version.istaged, bool))
+
+        # check version format
+        if fluid_version.istaged:
+            self.assertEqual(fluid_version.major, 0)
+            self.assertEqual(fluid_version.minor, 0)
+            self.assertEqual(fluid_version.patch, "0")
+            self.assertEqual(fluid_version.rc, 0)
+            self.assertEqual(fluid_version.full_version, "0.0.0")
+        else:
+            self.assertTrue(re.match(self._major_regex, fluid_version.major))
+            self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
+            self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
+            self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
+            self.assertTrue(
+                re.match(self._version_regex, fluid_version.full_version))
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index a995ee10f29a714b674fae4b31070e6ba2ca9953..910d9538b009496813f40b82d62eb2b12964a99f 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -18,14 +18,6 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def as_lodtensor(np_array, lod, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_value, place)
-    if lod is not None:
-        tensor.set_recursive_sequence_lengths(lod)
-    return tensor
-
-
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
@@ -69,14 +61,19 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 
 def set_input(scope, op, inputs, place):
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def __set_input__(var_name, var):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
             if isinstance(var, tuple):
                 tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
-            tensor.set_dims(var.shape)
-            tensor.set(var, place)
+            tensor._set_dims(var.shape)
+            tensor.set(np_value_to_fluid_value(var), place)
         elif isinstance(var, float):
             scope.find_var(var_name).set_float(var)
         elif isinstance(var, int):
@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
         if name not in np_list:
             assert var_proto.intermediate, "{} not found".format(name)
         else:
+            # inferece the dtype from numpy value.
             np_value = np_list[name]
             if isinstance(np_value, tuple):
                 dtype = np_value[0].dtype
@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
                 if is_input:
                     shape = list(np_value.shape)
                     lod_level = 0
+        # NOTE(dzhwinter): type hacking
+        # numpy float16 is binded to paddle::platform::float16
+        # in tensor_py.h via the help of uint16 datatype. Because
+        # the internal memory representation of float16 is
+        # actually uint16_t in paddle. So we use np.uint16 in numpy for
+        # raw memory, it can pass through the pybind. So in the testcase,
+        # we feed data use data.view(uint16), but the dtype is float16 in fact.
+        # The data.view(uint16) means do not cast the data type, but process data as the uint16
+        if dtype == np.uint16:
+            dtype = np.float16
         return block.create_var(
             dtype=dtype, shape=shape, lod_level=lod_level, name=name)
 
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index c62792face3c353db1f2e3c77eaf4bd32fbded69..d0eb3fd3724899aad39422983fd3cd0d00ff2a2d 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -403,7 +403,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = fluid.layers.open_recordio_file(
-        filename='./wmt16.recordio',
+        filename='/tmp/wmt16.recordio',
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index b6e0241265b18377874efb0d223441994b4650d0..64049a93cb0a267722de9cd94961b6256551330d 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -14,6 +14,9 @@
 
 import contextlib
 import os
+import errno
+import shutil
+import time
 
 import core
 
@@ -94,7 +97,7 @@ class EndStepEvent(object):
 
 class CheckpointConfig(object):
     """
-    Parameter object for :code:`fluid.io.save_checkpoint` and
+    Parameter object for :code:`save_checkpoint` and
     :code:`fluid.Trainer`. Used to configuration how to save checkpoint.
 
     Args:
@@ -237,7 +240,7 @@ class Trainer(object):
         self.checkpoint_cfg = checkpoint_config
         if self.checkpoint_cfg:
             assert isinstance(self.checkpoint_cfg, CheckpointConfig)
-            serial = io.get_latest_checkpoint_serial(
+            serial = _get_latest_checkpoint_serial(
                 self.checkpoint_cfg.checkpoint_dir)
             self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
 
@@ -276,32 +279,15 @@ class Trainer(object):
             exe = executor.Executor(place)
             exe.run(self.startup_program)
 
-        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial:
-            with self._prog_and_scope_guard():
-                exe = executor.Executor(place)
-                io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
-                                   self.checkpoint_cfg.load_serial,
-                                   self.startup_program)
-
-                if not self.checkpoint_cfg.pserver_id:
-                    epoch_id, step_id = io.load_trainer_args(
-                        self.checkpoint_cfg.checkpoint_dir,
-                        self.checkpoint_cfg.load_serial, self.trainer_id,
-                        self._get_checkpoint_load_args())
-                    self.checkpoint_cfg.epoch_id = int(epoch_id)
-                    self.checkpoint_cfg.step_id = int(step_id)
-                else:
-                    if self.checkpoint_cfg.lookup_table_name:
-                        io.load_lookup_table_vars(
-                            exe, self.checkpoint_cfg.checkpoint_dir,
-                            self.startup_program,
-                            self.checkpoint_cfg.pserver_id,
-                            self.checkpoint_cfg.lookup_table_name)
+        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None:
+            self._load_checkpoint()
 
         if param_path and os.path.isdir(param_path):
             # load params from param_path into scope
-            io.load_persist_vars_without_grad(
-                exe, dirname=param_path, program=self.startup_program)
+            io.load_persistables(
+                executor=exe,
+                dirname=param_path,
+                main_program=self.startup_program)
 
     def _transpile_nccl2_dist(self):
         # PADDLE_TRAINER_IPS
@@ -549,7 +535,7 @@ class Trainer(object):
 
     def _clean_checkpoint(self):
         assert self.checkpoint_cfg
-        io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
+        clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
 
     def _get_checkpoint_load_args(self):
         """
@@ -572,7 +558,7 @@ class Trainer(object):
         if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
             and step_id % self.checkpoint_cfg.step_interval == 0:
             exe = executor.Executor(self.place)
-            io.save_checkpoint(
+            save_checkpoint(
                 executor=exe,
                 checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
                 trainer_id=self.trainer_id,
@@ -580,6 +566,41 @@ class Trainer(object):
                 main_program=self.train_program,
                 max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
 
+    def _load_checkpoint(self):
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            load_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                main_program=self.startup_program)
+
+            if not self.checkpoint_cfg.pserver_id:
+                load_trainer_args = self._get_checkpoint_load_args()
+                trainer_args = load_checkpoint(
+                    executor=exe,
+                    checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                    main_program=self.startup_program,
+                    role_id=self.trainer_id,
+                    is_trainer=True,
+                    load_trainer_args=load_trainer_args)
+
+                if len(trainer_args) != 2:
+                    raise ValueError(
+                        "the return trainer_args length do not equal _get_checkpoint_load_args"
+                    )
+                self.checkpoint_cfg.epoch_id = int(trainer_args[0])
+                self.checkpoint_cfg.step_id = int(trainer_args[1])
+            else:
+                if self.checkpoint_cfg.lookup_table_name:
+                    load_checkpoint(
+                        executor=exe,
+                        checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                        main_program=self.startup_program,
+                        role_id=self.checkpoint_cfg.pserver_id,
+                        is_trainer=False,
+                        load_trainer_args=None,
+                        load_lookup_table=self.checkpoint_cfg.lookup_table_name)
+
 
 def build_feed_var_list(program, feed_order):
     if not isinstance(program, framework.Program):
@@ -602,3 +623,610 @@ def build_feed_var_list(program, feed_order):
             program.global_block().var(pair[0]) for pair in sorted_pair_list
         ]
     return feed_var_list
+
+
+# move Checkpoint APIs from io.py to trainer.py, make all of them are private.
+SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+MODEL_DIR = "__model__"
+LOOKUP_TABLE_DIR = "__lookup_table__"
+TRAINER_PREFIX = "trainer"
+CHECKPOINT_SEPARATOR = "_"
+
+
+def save_checkpoint(executor,
+                    checkpoint_dir,
+                    trainer_id,
+                    main_program,
+                    trainer_args=None,
+                    max_num_checkpoints=3,
+                    lookup_table=None,
+                    pserver_endpoints=None):
+    """
+    This function filters out all checkpoint variables from the give
+    main_program and then saves these variables to the `checkpoint_dir` 
+    directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there might be a lot of checkpoints in the 
+    `checkpoint_dir`. To avoid them taking too much disk space, the 
+    `max_num_checkpoints` are introduced to limit the total number of 
+    checkpoints. If the number of existing checkpints is greater than 
+    the `max_num_checkpoints`, oldest ones will be scroll deleted.
+
+    A variable is a checkpoint variable and will be saved if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for save checkpoint.
+        checkpoint_dir(str): The folder where to save checkpoints.
+        trainer_id(int): currect trainer id, if id is equal to 0, the trainer 
+            is chief.
+        trainer_args(dict|None): Current training arguments. Such as 'epoch_id' 
+            and 'step_id'.
+            Defaut: None
+        main_program(Program): The program whose checkpoint variables will
+            be saved.
+        max_num_checkpoints(int): The max number of total number of existing 
+            checkpoints.
+            Default: 3
+        lookup_table(string|None): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name 
+        pserver_endpoints(list|None): the parameter server ip:port list.  
+            when use distribute lookup table, we can get pserver_endpoints by 
+            distribute arguments.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        AssertionError: If `trainer_args` is not a dict.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            trainer_args = {"epoch_id": 200,
+                            "step_id": 20} # just an example
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            save_checkpoint(executor=exe,
+                                     checkpoint_dir=path,
+                                     trainer_id=0,
+                                     trainer_args=trainer_args,
+                                     main_program=prog,
+                                     max_num_checkpoints=3,
+                                     lookup_table=table_name,
+                                     pserver_endpoints = ps_endpoints)
+    """
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if main_program is None:
+        raise ValueError('main_program should not be None.')
+
+    if trainer_args:
+        assert isinstance(trainer_args, dict)
+
+    is_chief = trainer_id == 0
+
+    _make_chekcpoint_dirs(checkpoint_dir)
+    serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+
+    _save_trainer_args(cur_dir, trainer_id, trainer_args)
+
+    if is_chief:
+        _save_persist_vars_without_grad(executor, cur_dir, main_program)
+
+    if is_chief and lookup_table and pserver_endpoints:
+        _save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
+                                     pserver_endpoints)
+
+    _scroll_delete(checkpoint_dir, max_num_checkpoints)
+
+
+def load_checkpoint(executor,
+                    checkpoint_dir,
+                    main_program,
+                    role_id=0,
+                    is_trainer=True,
+                    load_trainer_args=None,
+                    load_lookup_table=None):
+    """
+    This function filters out all checkpoint variables from the give
+    main_program and then try to load these variables from the
+    `checkpoint_dir` directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there are more than one checkpoint in the 
+    `checkpoint_dir` (each checkpoint has its own sub folder), use 
+    `serial` to specify which serial of checkpoint you would like to
+    load.
+
+    A variable is a checkpoint variable and will be loaded if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading checkpoint.
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        main_program(Program): The program whose checkpoint variables will
+                               be loaded.
+        role_id(int):  the trainer id or the parameter server id.
+        is_trainer(bool): trainer is True and parameter server is False.
+        load_trainer_args(list|None): list about load trainer args.
+        load_lookup_table(str|None): the lookup table name
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        ValueError: If `main_program` is None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            load_checkpoint(executor=exe, checkpoint_dir=path,
+                    serial=9, main_program=prog)
+
+            # In this example, `load_checkpoint` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then try to load these variables form the
+            # folder "./checkpoints/checkpoint_9/__model__".
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    serial = _get_latest_checkpoint_serial(checkpoint_dir)
+
+    # there are nothing  need to be loaded
+    if serial is None or serial < 0:
+        return
+
+    if main_program is None:
+        raise ValueError('main_program should not be None.')
+
+    if is_trainer and load_trainer_args is None:
+        cur_dir = _get_serial_dir(checkpoint_dir, serial)
+        _load_persist_vars_without_grad(executor, cur_dir, main_program, True)
+        return
+
+    if is_trainer and load_trainer_args:
+        return _load_trainer_args(checkpoint_dir, serial, role_id,
+                                  load_trainer_args)
+
+    if not is_trainer and load_lookup_table:
+        _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id,
+                                load_lookup_table)
+
+
+def clean_checkpoint(checkpoint_dir, delete_dir=False):
+    """
+    clean the checkpoint dir, when the train exits normally, 
+    the trainer will call clean_checkpoint to delete checkpoint directory saved before.
+    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+
+    : param checkpoint_dir
+    : param delete_dir
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
+
+    if delete_dir and not os.listdir(checkpoint_dir):
+        os.rmdir(checkpoint_dir)
+
+
+def _load_persist_vars_without_grad(executor,
+                                    dirname,
+                                    program,
+                                    has_model_dir=False):
+    """
+    This function filters out all checkpoint variables from the give
+    program and then trys to load these variables from the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be loaded.
+        has_model_dir(bool): if True, the function loads variables
+                             from a sub directory named '__model__'.
+                             Default: False
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            _load_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog, has_model_dir=True)
+
+            # In this example, `_load_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then trys to load these variables form the
+            # folder "./my_paddle_model/__model__".
+    """
+
+    if has_model_dir:
+        dirname = _get_model_dir(dirname)
+
+    io.load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var,
+        filename=None)
+
+
+def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
+    """
+    The parameter server will load lookup table's local file in 
+    selectedrows variable.
+
+    Args:
+        executor(Executor): The executor to run for loading persistable variables
+        dirname(str): The directory path
+        main_program(Program): Find the variable named table_name in main_program
+        pserver_id(int): the serial number in pserver_endpoints list
+        table_name(str): lookup table name
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            dirname = "./checkpoints/checkpoint_9/"
+            prog = fluid.default_main_program()
+            pserver_id = 1
+            table_name = "share_w"
+            _load_lookup_table_vars(executor=exe,
+                    dirname=dirname, program=prog, pserver_id=pserver_id,
+                    table_name=table_name)
+    """
+
+    for var in program.list_vars():
+        if var.name == table_name:
+            lookup_table_var = var
+            break
+
+    assert lookup_table_var is not None
+
+    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
+    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
+
+    load_prog = framework.Program()
+    load_block = load_prog.global_block()
+
+    load_block.append_op(
+        type='load',
+        inputs={},
+        outputs={'Out': [lookup_table_var]},
+        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
+
+    executor.run(load_prog)
+
+
+def _save_persist_vars_without_grad(executor, dirname, program):
+    """
+    This function filters out all checkpoint variables from the give
+    program and then save these variables to a sub-folder '__model__' of 
+    the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be saved.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            _save_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog)
+
+            # In this example, `_save_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then saves these variables to the folder 
+            # "./my_paddle_model/__model__".
+    """
+    cur_dir = _get_model_dir(dirname)
+    io.save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+
+
+def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
+                                 ps_endpoint_list):
+    """
+    This function will send checkpoint notify message from Trainer 0
+    to all the pservers.
+    The checkpoint notify message contains lookup table name, 
+    the absolute path on pserver to save lookup_table.
+
+    Args:
+        executor(Executor): The executor to run for send checkpoint notify.
+        dirname(str): The folder where to save checkpoints.
+        lookup_table(string): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name 
+        ps_endpoint_list(list): the parameter server ip:port list.  
+            when use distribute lookup table, we can get ps_endpoint_list by 
+            distribute arguments.
+    Return:
+        None
+    
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            _save_pserver_vars_by_notify(executor=exe,
+                    dirname=param_path, lookup_table=table_name, 
+                    ps_endpoint_list=ps_endpoints)
+    """
+    cur_dir = _get_lookuptable_dir(dirname)
+
+    checkpoint_notify_program = framework.Program()
+    checkpoint_notify_block = checkpoint_notify_program.global_block()
+
+    attrs = {}
+    attrs['epmap'] = ps_endpoint_list
+    attrs['dir'] = cur_dir
+    attrs['lookup_table'] = lookup_table
+
+    checkpoint_notify_block.append_op(
+        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+    executor.run(checkpoint_notify_program)
+
+
+def _save_trainer_args(dirname, trainer_id, trainer_args):
+    assert isinstance(trainer_args, dict)
+
+    cur_dir = _get_trainer_dir(dirname, trainer_id)
+
+    for name, value in trainer_args.iteritems():
+        args_file = os.path.join(cur_dir, name)
+        with open(args_file, 'w') as f:
+            f.write(str(value))
+    _write_success(cur_dir)
+
+
+def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
+    """
+    trainer will load some args from it's independent directory, 
+    such as epoch_id and step_id.
+
+    Args:
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        trainer_id(int): current trainer id.
+        trainer_args(list): list about load trainer args
+    Return:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            param_path = "./checkpoint/"
+            serial = 7
+            trainer_id = 2
+            trainer_args = ["epoch_id", "step_id"]
+
+            _load_trainer_args(checkpoint_dir=param_path, serial=serial,
+            trainer_id=trainer_id, trainer_args=trainer_args)
+    """
+    assert isinstance(trainer_args, list)
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
+
+    ret_values = []
+
+    for arg in trainer_args:
+        cur_file = os.path.join(cur_dir, arg)
+        with open(cur_file, 'r') as f:
+            contents = f.read()
+            ret_values.append(contents.strip())
+    return ret_values
+
+
+def _is_checkpoint_var(var):
+    """
+    the checkpoint will not save or load all the variables.
+    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+    : param var(Variable)
+    """
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+    # @GRAD are named for gradient variables, checkpoint will not save it.
+    if "@GRAD" in var.name:
+        return False
+    # .trainer_ are named for distribute train variables, checkpoint will not save it.
+    if ".trainer_" in var.name:
+        return False
+
+    # .block is named for distribute train variables, checkpoint will not save it.
+    if ".block" in var.name:
+        return False
+
+    return var.persistable
+
+
+def _make_chekcpoint_dirs(dirs):
+    """
+    _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it.
+    """
+    assert dirs is not None
+
+    if os.path.isfile(dirs):
+        raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs)
+
+    if not os.path.isdir(dirs):
+        try:
+            os.makedirs(dirs)
+        except OSError as err:
+            if err.errno != errno.EEXIST:
+                raise err
+
+
+def _get_dir_serial(dirname):
+    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
+
+    try:
+        serial_num = int(serial)
+    except ValueError:
+        serial_num = -1
+    return serial_num
+
+
+def _get_serial_dir(dirname, serial):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    serial_dir = os.path.join(dirname, serial_folder)
+    _make_chekcpoint_dirs(serial_dir)
+
+    return serial_dir
+
+
+def _get_model_dir(dirname):
+    model_dir = os.path.join(dirname, MODEL_DIR)
+    _make_chekcpoint_dirs(model_dir)
+    return model_dir
+
+
+def _get_lookuptable_dir(dirname):
+    lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
+    _make_chekcpoint_dirs(lookuptable_dir)
+    return lookuptable_dir
+
+
+def _get_trainer_dir(dirname, trainer_id):
+    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
+    trainer_dir = os.path.join(dirname, trainer_folder)
+    _make_chekcpoint_dirs(trainer_dir)
+    return trainer_dir
+
+
+def _scroll_delete(dirname, max_num_checkpoints=3):
+    dirs = os.listdir(dirname)
+    serial_map = {}
+    for serial in dirs:
+        serial_num = _get_dir_serial(serial)
+        serial_map[serial_num] = serial
+
+    if len(serial_map.keys()) <= max_num_checkpoints:
+        return
+
+    serials = serial_map.keys()
+    serials.sort(reverse=True)
+    serials = serials[max_num_checkpoints:]
+    for serial in serials:
+        cur_dir = _get_serial_dir(dirname, serial)
+        try:
+            shutil.rmtree(cur_dir)
+        except OSError as err:
+            if err.errno != errno.ENOENT:
+                raise err
+
+
+def _write_success(dirname):
+    """
+    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
+
+    : param dirname
+    """
+    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
+    with open(success_file, 'a') as f:
+        now = time.ctime()
+        f.write(now)
+
+
+def _get_latest_checkpoint_serial(checkpoint_dir):
+    """
+    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
+
+    : param checkpoint_dir
+    """
+    if not checkpoint_dir:
+        return -1
+
+    def has_success(checkpoint_dir, cur_dir):
+        """
+        is _SUCCESS in this dir
+        """
+
+        serial = _get_dir_serial(cur_dir)
+        if serial == -1 or not os.path.isdir(
+                os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        success_path = os.path.join(
+            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
+            SUCCESS_MARK_FILENAME)
+        if os.path.isfile(success_path):
+            return serial
+
+    if not os.path.isdir(checkpoint_dir):
+        return -1
+
+    current_dir = -1
+    dirs = os.listdir(checkpoint_dir)
+    for cur_dir in dirs:
+        success_num = has_success(checkpoint_dir, cur_dir)
+        if success_num > current_dir:
+            current_dir = success_num
+    return current_dir
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index cf18090f71f34be5105498f5846dbcdf15ab2e3f..eae13b50398f791d4a203b72a0e96f3e87cc2a88 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from distribute_transpiler import DistributeTranspiler
+from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
 from inference_transpiler import InferenceTranspiler
 from memory_optimization_transpiler import memory_optimize, release_memory
 from ps_dispatcher import HashName, RoundRobin
 
 __all__ = [
     "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
-    "release_memory", "HashName", "RoundRobin"
+    "release_memory", "HashName", "RoundRobin", "DistributeTranspilerConfig"
 ]
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index f10b496306a002ee131d01798a0698b807d379ca..2ca1d4716b103d17117ae3ee958667c3a9747cdf 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -17,10 +17,10 @@ def delete_ops(block, ops):
     try:
         start = list(block.ops).index(ops[0])
         end = list(block.ops).index(ops[-1])
-        [block.remove_op(start) for _ in xrange(end - start + 1)]
+        [block._remove_op(start) for _ in xrange(end - start + 1)]
     except Exception, e:
         raise e
-    block.program.sync_with_cpp()
+    block.program._sync_with_cpp()
 
 
 def find_op_by_input_arg(block, arg_name):
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 53d6ca86a008f798af2854a154cce8b7242d2f35..820509bbcc4679cadb06554476798c76e6869eb5 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -31,13 +31,14 @@ Steps to transpile pserver:
 from __future__ import print_function
 
 import math
+import random
 import numpy as np
 
 from ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
                         default_startup_program, Block, \
-                        Variable, Parameter, grad_var_name
+                        Parameter, grad_var_name
 from details import *
 
 LOOKUP_TABLE_TYPE = "lookup_table"
@@ -63,7 +64,7 @@ def same_or_split_var(p_name, var_name):
     return p_name == var_name or p_name.startswith(var_name + ".block")
 
 
-def slice_variable(var_list, slice_count, min_block_size=8192):
+def slice_variable(var_list, slice_count, min_block_size):
     """
     We may need to split dense tensor to one or more blocks and put
     them equally onto parameter server. One block is a sub-tensor
@@ -109,6 +110,22 @@ def slice_variable(var_list, slice_count, min_block_size=8192):
     return blocks
 
 
+class DistributeTranspilerConfig(object):
+    """
+    slice_var_up (bool): Do Tensor slice for pservers, default is True.
+    split_method (PSDispatcher): RoundRobin or HashName can be used
+        try to choose the best method to balance loads for pservers.
+    min_block_size (int): Minimum splitted element number in block.
+        According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
+        We can use bandwidth effiently when data size is larger than 2MB.If you 
+        want to change it, please be sure you see the slice_variable function.
+    """
+
+    slice_var_up = True
+    split_method = None
+    min_block_size = 8192
+
+
 class DistributeTranspiler(object):
     """
     **DistributeTranspiler**
@@ -145,13 +162,23 @@ class DistributeTranspiler(object):
                 trainer_program = t.get_trainer_program()
     """
 
+    def __init__(self, config=None):
+        if config is not None:
+            self.config = config
+        else:
+            self.config = DistributeTranspilerConfig()
+
+        if self.config.split_method is None:
+            self.config.split_method = RoundRobin
+
+        assert (self.config.min_block_size >= 8192)
+        assert (self.config.split_method.__bases__[0] == PSDispatcher)
+
     def transpile(self,
                   trainer_id,
                   program=None,
                   pservers="127.0.0.1:6174",
                   trainers=1,
-                  slice_var_up=True,
-                  split_method=RoundRobin,
                   sync_mode=True):
         """
         Run the transpiler.
@@ -164,12 +191,8 @@ class DistributeTranspiler(object):
             pservers (str): comma separated ip:port string for the pserver
                 list.
             trainers (int): number of trainers in the distributed job.
-            slice_var_up (bool): Do Tensor slice for pservers, default is True.
-            split_method (PSDispatcher): RoundRobin or HashName can be used
-                try to choose the best method to balance loads for pservers.
             sync_mode (bool): Do sync training or not, default is True.
         """
-        assert (split_method.__bases__[0] == PSDispatcher)
         if program is None:
             program = default_main_program()
         self.origin_program = program
@@ -180,11 +203,11 @@ class DistributeTranspiler(object):
         self.pserver_endpoints = pserver_endpoints
         self.optimize_ops, self.params_grads = self._get_optimize_pass()
 
-        ps_dispatcher = split_method(self.pserver_endpoints)
+        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
         self.has_distributed_lookup_table = self._has_distributed_lookup_table()
 
         # split and create vars, then put splited vars in dicts for later use.
-        self._init_splited_vars(slice_var_up)
+        self._init_splited_vars()
 
         # step 3.1: insert send op to send gradient vars to parameter servers
         ps_dispatcher.reset()
@@ -196,13 +219,14 @@ class DistributeTranspiler(object):
         #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
         # shuffle the map will avoid the uneven distribution above
         grad_var_mapping_items = self.grad_var_mapping.items()
-        if not slice_var_up:
-            np.random.shuffle(grad_var_mapping_items)
+        if not self.config.slice_var_up:
+            random.seed(self.trainer_num)
+            random.shuffle(grad_var_mapping_items)
 
         for orig_varname, splited_vars in grad_var_mapping_items:
             eplist = ps_dispatcher.dispatch(splited_vars)
 
-            if not slice_var_up:
+            if not self.config.slice_var_up:
                 assert (len(splited_vars) == 1)
 
             if len(splited_vars) == 1:
@@ -219,7 +243,7 @@ class DistributeTranspiler(object):
                 AssertionError("Can not insert the send op by original "
                                "variable name :", orig_varname)
 
-            program.global_block().insert_op(
+            program.global_block()._insert_op(
                 index=index + 1,
                 type="send",
                 inputs={"X": splited_vars},
@@ -269,14 +293,15 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
-        program.global_block().append_op(
-            type="fetch_barrier",
-            inputs={},
-            outputs={},
-            attrs={
-                "endpoints": pserver_endpoints,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        if self.sync_mode:
+            program.global_block().append_op(
+                type="fetch_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": pserver_endpoints,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
 
         for varname, splited_var in self.param_var_mapping.iteritems():
             if len(splited_var) <= 1:
@@ -323,6 +348,7 @@ class DistributeTranspiler(object):
 
         # step1
         pserver_program = Program()
+        pserver_program.random_seed = self.origin_program.random_seed
         # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
@@ -377,11 +403,6 @@ class DistributeTranspiler(object):
         # append it into the sub program.
 
         global_ops = []
-        # HACK: optimization global ops only used to scale beta1 and beta2
-        # replace it with dependency engine.
-        for op in self.optimize_ops:
-            if self._is_adam_connected_op(op):
-                global_ops.append(op)
 
         def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                                    lr_ops):
@@ -410,7 +431,7 @@ class DistributeTranspiler(object):
 
             # clone vars
             for var in origin_block.vars:
-                new_sub_block.clone_variable(var)
+                new_sub_block._clone_variable(var)
 
             # clone ops
             for origin_op in origin_block.ops:
@@ -442,6 +463,8 @@ class DistributeTranspiler(object):
             per_opt_block = pserver_program.create_block(pre_block_idx)
             optimize_blocks.append(per_opt_block)
             # append grad merging ops before clip and weight decay
+            # cases may like: 
+            # L2Decay op -> clip op -> optimize
             for _, op in enumerate(self.optimize_ops):
                 # find the origin @GRAD var before clipping
                 grad_varname_for_block = __op_have_grad_input__(op)
@@ -449,6 +472,7 @@ class DistributeTranspiler(object):
                     merged_var = self._append_pserver_grad_merge_ops(
                         per_opt_block, grad_varname_for_block, endpoint,
                         grad_to_block_id, self.origin_program)
+                    break  # append optimize op once then append other ops.
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
                 if ufind.is_connected(op, opt_op) and op not in global_ops:
@@ -472,6 +496,7 @@ class DistributeTranspiler(object):
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            optimize_blocks.append(table_opt_block)
             prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
             checkpoint_block_id = self._create_checkpoint_save_block(
@@ -503,7 +528,7 @@ class DistributeTranspiler(object):
             outputs={},
             attrs=attrs)
 
-        pserver_program.sync_with_cpp()
+        pserver_program._sync_with_cpp()
         return pserver_program
 
     def get_startup_program(self, endpoint, pserver_program):
@@ -522,6 +547,7 @@ class DistributeTranspiler(object):
         """
         s_prog = Program()
         orig_s_prog = default_startup_program()
+        s_prog.random_seed = orig_s_prog.random_seed
         params = self.param_grad_ep_mapping[endpoint]["params"]
 
         def _get_splited_name_and_shape(varname):
@@ -535,7 +561,7 @@ class DistributeTranspiler(object):
         pserver_vars = pserver_program.global_block().vars
         created_var_map = dict()
         for _, var in pserver_vars.iteritems():
-            tmpvar = s_prog.global_block().clone_variable(var)
+            tmpvar = s_prog.global_block()._clone_variable(var)
             created_var_map[var.name] = tmpvar
 
         # 2. rename op outputs
@@ -630,7 +656,7 @@ class DistributeTranspiler(object):
                 ]
         return param_list, grad_list
 
-    def _init_splited_vars(self, slice_var_up):
+    def _init_splited_vars(self):
         # update these mappings for further transpile:
         # 1. param_var_mapping: param var name -> [splited params vars]
         # 2. grad_var_mapping: grad var name -> [splited grads vars]
@@ -654,17 +680,22 @@ class DistributeTranspiler(object):
         param_list, grad_list = self._update_dist_lookup_table_vars(
             param_list, grad_list, self.params_grads)
 
-        if slice_var_up:
+        if self.config.slice_var_up:
             # when we slice var up into blocks, we will slice the var according to
             # pserver services' count. A pserver may have two or more listening ports.
-            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            grad_blocks = slice_variable(grad_list,
+                                         len(self.pserver_endpoints),
+                                         self.config.min_block_size)
             param_blocks = slice_variable(param_list,
-                                          len(self.pserver_endpoints))
+                                          len(self.pserver_endpoints),
+                                          self.config.min_block_size)
         else:
             # when we do NOT slice var up into blocks, we will always slice params
             # grads into one block.
-            grad_blocks = slice_variable(grad_list, 1)
-            param_blocks = slice_variable(param_list, 1)
+            grad_blocks = slice_variable(grad_list, 1,
+                                         self.config.min_block_size)
+            param_blocks = slice_variable(param_list, 1,
+                                          self.config.min_block_size)
         assert (len(grad_blocks) == len(param_blocks))
 
         # origin_varname -> [splited_var]
@@ -733,7 +764,7 @@ class DistributeTranspiler(object):
                     self.all_prefetch_output_vars.append(prefetch_output_vars)
 
                     # insert split_ids_op
-                    program.global_block().insert_op(
+                    program.global_block()._insert_op(
                         index=lookup_table_op_index,
                         type="split_ids",
                         inputs={
@@ -745,18 +776,20 @@ class DistributeTranspiler(object):
                         outputs={"Out": prefetch_input_vars})
 
                     # insert prefetch_op
-                    program.global_block().insert_op(
+                    program.global_block()._insert_op(
                         index=lookup_table_op_index + 1,
                         type="prefetch",
                         inputs={'X': prefetch_input_vars},
                         outputs={"Out": prefetch_output_vars},
                         attrs={
                             "epmap": pserver_endpoints,
-                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                            # FIXME(qiao) temporarily disable this config because prefetch
+                            # is not act as other rpc op, it's more like a forward op
+                            # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                         })
 
                     # insert concat_op
-                    program.global_block().insert_op(
+                    program.global_block()._insert_op(
                         index=lookup_table_op_index + 2,
                         type="merge_ids",
                         inputs={
@@ -787,14 +820,14 @@ class DistributeTranspiler(object):
             if table_grad_name in op.output_arg_names:
                 op_index = list(all_ops).index(op)
                 # insert split_ids_op
-                program.global_block().insert_op(
+                program.global_block()._insert_op(
                     index=op_index + 1,
                     type="split_ids",
                     inputs={
                         'Ids': [program.global_block().vars[table_grad_name]]
                     },
                     outputs={"Out": self.trainer_side_table_grad_list})
-                program.global_block().insert_op(
+                program.global_block()._insert_op(
                     index=op_index + 2,
                     type="send",
                     inputs={'X': self.trainer_side_table_grad_list},
@@ -853,18 +886,17 @@ class DistributeTranspiler(object):
             persistable=True)
         # parameter must be selected rows
         param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-        grad_var = pserver_program.global_block().clone_variable(
+        grad_var = pserver_program.global_block()._clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
                 self.table_name)])
 
         # create table optimize block in pserver program
         table_opt_op = [
             op for op in self.optimize_ops
-            if op.input("Param")[0] == self.table_name
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
         ][0]
         table_opt_block = pserver_program.create_block(pre_block_idx)
-        # only support sgd now
-        assert table_opt_op.type == "sgd"
 
         if self.sync_mode:
             # create grad vars in pserver program
@@ -893,7 +925,7 @@ class DistributeTranspiler(object):
             if not splited_grad_name.startswith(origin_grad_name):
                 raise ValueError("origin_grad_var: " + splited_grad_name +
                                  " grad_var:" + grad_var.name)
-            grad_var = pserver_program.global_block().rename_var(
+            grad_var = pserver_program.global_block()._rename_var(
                 origin_grad_name, splited_grad_name)
 
         lr_var = pserver_program.global_block().vars[table_opt_op.input(
@@ -904,11 +936,12 @@ class DistributeTranspiler(object):
             "LearningRate": [lr_var]
         }
         outputs = {"ParamOut": [param_var]}
-        table_opt_block.append_op(
-            type=table_opt_op.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=table_opt_op.attrs)
+        # only support sgd now
+        import logging
+        logging.warn(
+            "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+            + table_opt_op.type)
+        table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs)
 
         # add table parameter gradient and it's block id to grad_to_block_id
         grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
@@ -969,7 +1002,7 @@ class DistributeTranspiler(object):
                 if self.sync_mode and add_trainer_suffix:
                     new_var_name = "%s.trainer_%d" % \
                         (orig_var.name, self.trainer_id)
-                    program.global_block().rename_var(varname, new_var_name)
+                    program.global_block()._rename_var(varname, new_var_name)
                     var_mapping[varname] = \
                         [program.global_block().var(new_var_name)]
                 else:
@@ -1003,7 +1036,7 @@ class DistributeTranspiler(object):
                     type=orig_var.type,
                     shape=splited_shape)  # flattend splited var
                 var_mapping[varname].append(var)
-            program.global_block().sync_with_cpp()
+            program.global_block()._sync_with_cpp()
         return var_mapping
 
     def create_splited_vars(self, source_var, block, tag):
@@ -1017,7 +1050,6 @@ class DistributeTranspiler(object):
         ]
 
     def _clone_var(self, block, var, persistable=True):
-        assert isinstance(var, Variable)
         return block.create_var(
             name=var.name,
             shape=var.shape,
@@ -1031,7 +1063,7 @@ class DistributeTranspiler(object):
             height_sections = []
             for v in splited_vars:
                 height_sections.append(v.shape[0])
-            program.global_block().insert_op(
+            program.global_block()._insert_op(
                 index=index + 1,
                 type="split_selected_rows",
                 inputs={"X": orig_var},
@@ -1041,7 +1073,7 @@ class DistributeTranspiler(object):
             sections = []
             for v in splited_vars:
                 sections.append(v.shape[0])
-            program.global_block().insert_op(
+            program.global_block()._insert_op(
                 index=index + 1,
                 type="split_byref",
                 inputs={"X": orig_var},
@@ -1230,7 +1262,7 @@ class DistributeTranspiler(object):
                 varlist = [varlist]
             for var in varlist:
                 if var not in program.global_block().vars:
-                    block.clone_variable(var)
+                    block._clone_variable(var)
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, op)
@@ -1239,7 +1271,7 @@ class DistributeTranspiler(object):
                 varlist = [varlist]
             for var in varlist:
                 if var not in program.global_block().vars:
-                    block.clone_variable(var)
+                    block._clone_variable(var)
 
         return block.append_op(
             type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
@@ -1277,7 +1309,7 @@ class DistributeTranspiler(object):
                 if grad_block:
                     outputs[key] = grad_block
                 elif not program.global_block().vars.has_key(var.name):
-                    program.global_block().clone_variable(var)
+                    program.global_block()._clone_variable(var)
 
         return optimize_block.append_op(
             type=opt_op.type,
@@ -1289,26 +1321,8 @@ class DistributeTranspiler(object):
         # If one op's input is another op's output or
         # one op's output is another op's input, we say
         # the two operator is connected.
-        def _append_inname_remove_beta(varname_list):
-            op_input_names = []
-            for in_name in varname_list:
-                # HACK: remove beta1 and beta2 to avoid let all
-                # ops connected.
-                if in_name.startswith("beta2_pow_acc") or \
-                    in_name.startswith("beta1_pow_acc"):
-                    continue
-                else:
-                    op_input_names.append(in_name)
-            return op_input_names
-
-        op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
-        op1_output_names = op1.desc.output_arg_names()
-
-        op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
-        op2_output_names = op2.desc.output_arg_names()
-
-        if set(op1_output_names) & set(op2_input_names) or \
-           set(op1_input_names) & set(op2_output_names):
+        if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
+           set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
             return True
         return False
 
@@ -1413,7 +1427,7 @@ class DistributeTranspiler(object):
 
     def _get_optimize_pass(self):
         """
-        Get optimizer operators, paramters and gradients from origin_program
+        Get optimizer operators, parameters and gradients from origin_program
         Returns:
             opt_ops (list): optimize operators.
             params_grads (dict): paramter->gradient.
@@ -1436,20 +1450,6 @@ class DistributeTranspiler(object):
                             origin_var_dict[param_name],
                             origin_var_dict[input_name]
                         ])
-            elif self._is_adam_connected_op(op):
-                opt_ops.append(op)
             else:
                 pass
         return opt_ops, params_grads
-
-    def _is_adam_connected_op(self, op):
-        """
-        A hack function to determinate whether the input operator
-        is connected to optimize operator.
-        """
-        if op.type == "scale":
-            for in_name in op.input_arg_names:
-                if in_name.startswith("beta1_pow_acc") or \
-                        in_name.startswith("beta2_pow_acc"):
-                    return True
-        return False
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index b8afeae5ebd6ef7948a7c0c2775f419af461da04..f1905f08787da7a58a41d840ea68fb6c07f4028f 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -95,7 +95,7 @@ class InferenceTranspiler(object):
                     # modify bnorm OP to include relu
                     current_op.set_attr("fuse_with_relu", True)
                     # remove relu OP
-                    self.block.remove_op(i + 1)
+                    self.block._remove_op(i + 1)
             i = i + 1
 
         self._remove_unused_var()
@@ -171,7 +171,7 @@ class InferenceTranspiler(object):
                     # fuse batch_norm
                     self._fuse_param(current_op, next_op, bias_op, 0)
                     # remove batch_norm_op
-                    self.block.remove_op(i + 2)
+                    self.block._remove_op(i + 2)
                     i = i + 1
                 # conv2d with bias, the next_op.type is elementwise_add
                 elif (next_op.type == 'elementwise_add'):
@@ -180,7 +180,7 @@ class InferenceTranspiler(object):
                         # fuse batch_norm
                         self._fuse_param(current_op, next_next_op, next_op, 1)
                         # remove batch_norm_op
-                        self.block.remove_op(i + 2)
+                        self.block._remove_op(i + 2)
                         i = i + 1
             i = i + 1
 
@@ -212,7 +212,7 @@ class InferenceTranspiler(object):
         y_var = self.block.var(bn_op.input("Bias")[0])
         out_var = self.block.var(bn_op.output("Y")[0])
 
-        bias_op = self.block.insert_op(
+        bias_op = self.block._insert_op(
             index,
             type="elementwise_add",
             inputs={"X": x_var,
@@ -307,4 +307,4 @@ class InferenceTranspiler(object):
 
         for var in self.block.vars.keys():
             if var not in args:
-                self.block.remove_var(var)
+                self.block._remove_var(var)
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 999ef43ca0feacbddff5f9db59589ce7097fe77e..0ca5cf813b51e200da5edd5830767ad9457acec2 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,7 +14,7 @@
 
 from collections import defaultdict
 from .. import core
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter
 from ..backward import _rename_arg_
 
 dtype_to_size = {
@@ -177,7 +177,7 @@ class ControlFlowGraph(object):
                 in_diff)
             if can_optimize:
                 index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
-                delete_op = block_desc.insert_op(index)
+                delete_op = block_desc._insert_op(index)
                 delete_op.set_type("delete_var")
                 delete_op.set_input("X", can_optimize)
                 if is_forward:
@@ -324,6 +324,8 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
             sub_op_output = set()
             sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
             sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[fwd_id].input_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].input_arg_names())
             ops_list.append((sub_block_ops, block_op_size, sub_op_output))
 
         # Process rest fwd_op block ops
@@ -335,6 +337,7 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
                 sub_block_ops.append(sub_block.op(i))
             sub_op_output = set()
             sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[fwd_id].input_arg_names())
             ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
     return ops_list
 
@@ -349,13 +352,17 @@ def _get_cfgs(input_program):
     pdesc = input_program.get_desc()
     block_desc = pdesc.block(0)
     op_size = block_desc.op_size()
-    # Get global block ops
-    ops_list.append(
-        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
 
     # Only process one level of nested subblock.
     ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
 
+    skip_opt_set = set()
+    for _, _, skip_opt in ops_list:
+        skip_opt_set.update(skip_opt)
+
+    # Get global block ops
+    ops_list.insert(
+        0, ([block_desc.op(i) for i in range(op_size)], op_size, skip_opt_set))
     cfgs = [
         ControlFlowGraph(input_program, ops, forward_num, skip_opt)
         for ops, forward_num, skip_opt in ops_list
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
index 3b059735a924d58714cd88a761eb83143f1192d6..678026cf95970e8ff58c1bad20246059ffb464c1 100644
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
@@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator:
 TODO(yuyang18): Should we add whole design doc here?
 """
 
-import decorator
-from decorator import *
+import paddle.reader.decorator
+from paddle.reader.decorator import *
 
-import creator
+import paddle.reader.creator
 
 __all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 1f83cabb8481451736944823be45185deea4f43b..4b1fe94222d35f8c0e4e4cccc364227a3f9509d0 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -20,7 +20,7 @@ __all__ = [
 from threading import Thread
 import subprocess
 
-from Queue import Queue
+from six.moves.queue import Queue
 import itertools
 import random
 import zlib
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06..026cf501cfb35ab3fe35d24f52d3c271565482ef 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                 for i in xrange(buffer_size):
                     yield images[i, :], int(labels[i])
         finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass
 
     return reader
 
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 9235c41e9eb95b25a0dc53a494a203e7a4525981..08d8bd68f9b7eb703c15f7cb5ad1300969db5713 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -182,7 +182,7 @@ def resize_short(im, size):
         h_new = size * h / w
     else:
         w_new = size * w / h
-    im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
     return im
 
 
@@ -324,7 +324,6 @@ def simple_transform(im,
         if np.random.randint(2) == 0:
             im = left_right_flip(im, is_color)
     else:
-        im = center_crop(im, crop_size, is_color)
         im = center_crop(im, crop_size, is_color=is_color)
     if len(im.shape) == 3:
         im = to_chw(im)
diff --git a/python/requirements.txt b/python/requirements.txt
index ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816..c091ecb111bda9d5e83c3ddcae93aed0745f9e4c 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -8,4 +8,4 @@ scipy>=0.19.0
 Pillow
 nltk>=3.2.2
 graphviz
-LinkChecker
+six
diff --git a/python/setup.py.in b/python/setup.py.in
index 5506443733650631fe045be3f701a41519352e8d..4a6cddbbea4903f5a65123aa19b7e978b335f32b 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,16 +1,13 @@
 from setuptools import setup, Distribution, Extension
 import subprocess
-import shutil
 import os
+import re
+import shutil
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
 
-MAJOR   = 0
-MINOR   = 14
-PATCH   = 0
 RC      = 0
-ISTAGED = False
 
 
 
@@ -20,16 +17,51 @@ def git_commit():
         git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
     except:
         git_commit = 'Unknown'
-    return git_commit
+    git_commit = git_commit.decode()
+    return str(git_commit)
+
+def _get_version_detail(idx):
+    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
+        so detail index must less than 3"
+
+    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
+        version_details = '@PADDLE_VERSION@'.split('.')
+
+        if len(version_details) == 3:
+            return version_details[idx]
+
+    return 0
+
+def get_major():
+    return int(_get_version_detail(0))
+
+def get_minor():
+    return int(_get_version_detail(1))
+
+def get_patch():
+    return str(_get_version_detail(2))
+
+def is_taged():
+    try:
+        cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
+        git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
+        git_tag = git_tag.decode()
+    except:
+        return False
+
+    if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
+        return True
+    else:
+        return False
 
 def write_version_py(filename='paddle/version.py'):
     cnt = '''
 # THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
 #
-full_version    = '%(major)d.%(minor)d.%(patch)d'
+full_version    = '%(major)d.%(minor)d.%(patch)s'
 major           = '%(major)d'
 minor           = '%(minor)d'
-patch           = '%(patch)d'
+patch           = '%(patch)s'
 rc              = '%(rc)d'
 istaged         = %(istaged)s
 commit          = '%(commit)s'
@@ -37,13 +69,13 @@ with_mkl        = '%(with_mkl)s'
 
 def show():
     if istaged:
-        print 'full_version:', full_version
-        print 'major:', major
-        print 'minor:', minor
-        print 'patch:', patch
-        print 'rc:', rc
+        print('full_version:', full_version)
+        print('major:', major)
+        print('minor:', minor)
+        print('patch:', patch)
+        print('rc:', rc)
     else:
-        print 'commit:', commit
+        print('commit:', commit)
 
 def mkl():
     return with_mkl
@@ -51,13 +83,13 @@ def mkl():
     commit = git_commit()
     with open(filename, 'w') as f:
         f.write(cnt % {
-            'major': MAJOR,
-            'minor': MINOR,
-            'patch': PATCH,
+            'major': get_major(),
+            'minor': get_minor(),
+            'patch': get_patch(),
             'rc': RC,
             'version': '${PADDLE_VERSION}',
             'commit': commit,
-            'istaged': ISTAGED,
+            'istaged': is_taged(),
             'with_mkl': '@WITH_MKL@'})
 
 write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
@@ -72,6 +104,8 @@ packages=['paddle',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
+          'paddle.fluid.contrib',
+          'paddle.fluid.contrib.decoder',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
 
@@ -126,12 +160,15 @@ if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
     package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
 if '${WITH_MKLDNN}' == 'ON':
+    # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+    # we can support mkl on mac.
+    #
     # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
     # The reason is that all thirdparty libraries in the same directory,
     # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
     command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
     if os.system(command) != 0:
-        raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails")
+        raise Exception("patch libmkldnn.so failed, command: %s" % command)
     package_data['paddle.libs']+=['libmkldnn.so.0']
     shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 # remove unused paddle/libs/__init__.py
@@ -142,9 +179,20 @@ package_dir['paddle.libs']=libs_path
 # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
 # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
+if "@APPLE@" == "1":
+    command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
+else:
+    command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
 if os.system(command) != 0:
-    raise Exception("patchelf --set-rpath for core.so fails")
+    raise Exception("patch core.so failed, command: %s" % command)
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    # change rpath of _swig_paddle.so.
+    if "@APPLE@" == "1":
+        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    else:
+        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    if os.system(command) != 0:
+        raise Exception("patch _swig_paddle.so failed, command: %s" % command)
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
diff --git a/tools/check_pr_approval.py b/tools/check_pr_approval.py
new file mode 100644
index 0000000000000000000000000000000000000000..937b0be7562fab93157c16b942631f0a580dfc68
--- /dev/null
+++ b/tools/check_pr_approval.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+import json
+
+
+def check_approval(count, required_reviewers):
+    json_buff = ""
+    for line in sys.stdin:
+        json_buff = "".join([json_buff, line])
+    json_resp = json.loads(json_buff)
+    approves = 0
+    approved_user_ids = []
+    for review in json_resp:
+        if review["state"] == "APPROVED":
+            approves += 1
+            approved_user_ids.append(review["user"]["id"])
+
+    # convert to int
+    required_reviewers_int = set()
+    for rr in required_reviewers:
+        required_reviewers_int.add(int(rr))
+
+    if len(set(approved_user_ids) & required_reviewers_int) >= count:
+        print("TRUE")
+    else:
+        print("FALSE")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1].isdigit():
+        check_approval(int(sys.argv[1]), sys.argv[2:])
+    else:
+        print(
+            "Usage: python check_pr_approval.py [count] [required reviewer id] ..."
+        )
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,7 +4,7 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
+    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
         continue;
     else
         cpplint --filter=-readability/fn_size $file;
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index bca0b77ad71a3f65dda15191e5f540bfc2e043d1..0b72ea323b72a1a6cfd0911416c4037243d06ff4 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
   bash build_scripts/install_nccl2.sh && rm -r build_scripts