diff --git a/CMakeLists.txt b/CMakeLists.txt index a51552d96a462f297f2711f9aaa70f4fc6c8ba00..9ec632e20690eafdc558e24f160270a89b29ee41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,7 @@ if(WIN32) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) -if(NOT CMAKE_CROSSCOMPILING) - find_package(CUDA QUIET) -endif(NOT CMAKE_CROSSCOMPILING) +find_package(CUDA QUIET) find_package(Git REQUIRED) find_package(Threads REQUIRED) @@ -49,7 +47,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) @@ -60,11 +57,9 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) -option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) -option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) @@ -96,37 +91,6 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID OR IOS) - if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - endif() - endif() - - set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android and iOS" FORCE) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android and iOS" FORCE) - set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android and iOS" FORCE) - set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when cross-compiling for Android and iOS" FORCE) - set(WITH_NGRAPH OFF CACHE STRING - "Disable nGraph when cross-compiling for Android and iOS" FORCE) - set(WITH_GOLANG OFF CACHE STRING - "Disable golang when cross-compiling for Android and iOS" FORCE) - - # Compile PaddlePaddle mobile inference library - if (NOT WITH_C_API) - set(WITH_C_API ON CACHE STRING - "Always compile the C_API when cross-compiling for Android and iOS" FORCE) - endif() - set(MOBILE_INFERENCE ON) - add_definitions(-DPADDLE_MOBILE_INFERENCE) -endif() - if (APPLE) set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) @@ -135,8 +99,6 @@ endif() if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) - set(WITH_C_API OFF CACHE STRING - "Disable C_API when compiling for Windows" FORCE) set(WITH_FLUID_ONLY ON CACHE STRING "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() @@ -150,21 +112,7 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING 
set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING "A path setting fluid inference shared and static libraries") -if (WITH_C_API AND WITH_PYTHON) - message(WARNING "It is suggest not embedded a python interpreter in Paddle " - "when using C-API. It will give an unpredictable behavior when using a " - "different Python interpreter from compiling.") -endif() - -if (WITH_C_API) - set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) -endif() - -if(MOBILE_INFERENCE) - set(THIRD_PARTY_BUILD_TYPE MinSizeRel) -else() - set(THIRD_PARTY_BUILD_TYPE Release) -endif() +set(THIRD_PARTY_BUILD_TYPE Release) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) @@ -193,7 +141,6 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/ngraph) # download, build, install nGraph -include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 @@ -312,11 +259,6 @@ if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() -if(USE_NNPACK) - include(external/nnpack) - list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) -endif(USE_NNPACK) - set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/Dockerfile.android b/Dockerfile.android deleted file mode 100644 index 48db2efea21a648657e3f490c95429b9a29ede52..0000000000000000000000000000000000000000 --- a/Dockerfile.android +++ /dev/null @@ -1,42 +0,0 @@ -FROM ubuntu:16.04 -MAINTAINER PaddlePaddle Authors - -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -# ENV variables -ARG ANDROID_ABI -ARG ANDROID_API - -ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} -ENV ANDROID_API=${ANDROID_API:-21} - -ENV HOME=/root \ - ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_TOOLCHAINS_DIR=/opt/toolchains - -RUN apt-get update && \ - apt-get install -y \ - git python-dev python-pip python-numpy \ - wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ - apt-get clean -y - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -RUN pip install --upgrade pip==9.0.3 && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel sphinx && \ - pip install pre-commit - -# Android NDK -RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ - mkdir -p /opt/android-ndk-tmp && \ - cd /opt/android-ndk-tmp && \ - wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ - unzip -q android-ndk-r14b-linux-x86_64.zip && \ - mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - rm -rf /opt/android-ndk-tmp diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 24de8d9d7ced5f8111cc5d65f761b7506bde048e..52ac31d1d125afb89fb0ae783fba94ab9a0c5a1a 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -64,24 +64,18 @@ endif() ## Then find the reference-cblas. 
www.netlib.org/blas/ set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -if(NOT CMAKE_CROSSCOMPILING) - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas - ) +set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas +) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ - ) -else() - # Disable the finding of reference cblas under host's system path - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) -endif() +set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ +) if(WITH_SYSTEM_BLAS) find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS @@ -98,10 +92,3 @@ if(WITH_SYSTEM_BLAS) message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() endif() - -if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER vecLib) - set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) - add_definitions(-DPADDLE_USE_VECLIB) -endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e3d856fb30d8103f50ebcb6dc16153c8ed2a97a6..076e839120d98d801de4374f2f8338ebd918b88f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -49,12 +49,10 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) -if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) - elseif(SSE3_FOUND) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) +elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) endif() if(WIN32) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake deleted file mode 100644 index 4cf2be3bdf07ed018c57cd6bc305a3eda9c9a23d..0000000000000000000000000000000000000000 --- a/cmake/cross_compiling/android.cmake +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Android, and the -# configuration refers to the open-source resposity: -# https://github.com/taka-no-me/android-cmake -# Most of the variables are compatible with that used in -# https://developer.android.com/ndk/guides/cmake.html -# The supported variables are listed belows: -# -# ANDROID_STANDALONE_TOOLCHAIN -# ANDROID_TOOLCHAIN -# ANDROID_ABI -# ANDROID_NATIVE_API_LEVEL -# ANDROID_ARM_MODE -# ANDROID_ARM_NEON -# -# For CMake >= 3.7.0, all the settings will be delivered to CMake system -# variables to let CMake do the cross-compiling configurations itself. 
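A sketch of what that looks like in practice, hedged: with CMake >= 3.7 the configuration this deleted file implements by hand can be requested through built-in variables in a short user toolchain file (the standalone-toolchain path here is hypothetical):

    # minimal Android toolchain sketch for CMake >= 3.7; path is hypothetical
    set(CMAKE_SYSTEM_NAME Android)
    set(CMAKE_SYSTEM_VERSION 21)                 # Android API level
    set(CMAKE_ANDROID_ARCH_ABI armeabi-v7a)
    set(CMAKE_ANDROID_ARM_NEON ON)               # only meaningful for armeabi-v7a
    set(CMAKE_ANDROID_STANDALONE_TOOLCHAIN /opt/android-toolchain)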
-# More detail of cross-compiling settings -# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html - -IF(NOT ANDROID) - return() -ENDIF() - -# check the exist of android standalone toolchain -IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN) - SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN} - CACHE PATH "Folder holds the standalone toolchain of Android NDK") -ENDIF() -IF(NOT ANDROID_STANDALONE_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to " - "use a standalone toolchain.\n" - "To cross-compile for Android, you need to:\n" - "1. Download an Android NDK from" - " https://developer.android.com/ndk/downloads/index.html\n" - "2. Setup a standalone toolchain" - "https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n") -ENDIF() - -IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) - IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$") - STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}") - ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$") - SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL}) - ENDIF() -ENDIF() - -IF(NOT DEFINED ANDROID_TOOLCHAIN) - SET(ANDROID_TOOLCHAIN clang) -ENDIF() - -IF(NOT DEFINED ANDROID_ABI) - SET(ANDROID_ABI "armeabi-v7a") -ENDIF() - -IF(NOT DEFINED ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE ON) -ENDIF() -IF(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "arm") -ELSE(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "thumb") -ENDIF(ANDROID_ARM_MODE) - -IF(NOT DEFINED ANDROID_ARM_NEON) - SET(ANDROID_ARM_NEON ON) -ENDIF() - -IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") - IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0") - SET(CMAKE_SYSTEM_NAME "Linux") - ENDIF() - MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: " - "${CMAKE_VERSION}), when cross-compiling for Android.") - - IF(ANDROID_STANDALONE_TOOLCHAIN) - # Use standalone toolchain - SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") - - IF(NOT CMAKE_SYSTEM_VERSION) - SET(ANDROID_STANDALONE_TOOLCHAIN_API "") - SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)") - FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" - ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}") - IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}") - SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}") - ENDIF() - SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API}) - ENDIF() - - # Toolchain - SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - ELSE(ANDROID_NDK) - # TODO: use android ndk - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) - ENDIF() - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - SET(CMAKE_SYSTEM_PROCESSOR aarch64) - SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") - ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - SET(ANDROID_C_COMPILER_NAME clang) - SET(ANDROID_CXX_COMPILER_NAME clang++) - 
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) - SET(ANDROID_C_COMPILER_NAME gcc) - SET(ANDROID_CXX_COMPILER_NAME g++) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") - ENDIF() - - # C compiler - IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}") - ENDIF() - - # CXX compiler - IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}") - ENDIF() - - SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE) - SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - - # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") - SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") - - IF(ANDROID_ABI STREQUAL "armeabi") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv5te - -mtune=xscale - -msoft-float) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv7-a - -mfloat-abi=softfp) - IF(ANDROID_ARM_NEON) - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) - ENDIF() - LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - IF(ANDROID_ARM_MODE) - LIST(APPEND ANDROID_COMPILER_FLAGS -marm) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) - ENDIF() - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # Disable integrated-as for better compatibility. - LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) - ENDIF() - ENDIF() - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # CMake automatically forwards all compiler flags to the linker, - # and clang doesn't like having -Wa flags being used for linking. - # To prevent CMake from doing this would require meddling with - # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. 
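A minimal sketch of the flag-handling pattern this deleted file uses throughout (the DEMO_ name is hypothetical): flags are accumulated as a CMake ;-separated list, then flattened, because the CMAKE_*_FLAGS variables are plain space-separated strings:

    list(APPEND DEMO_LINKER_FLAGS -Wl,--gc-sections -Qunused-arguments)
    string(REPLACE ";" " " DEMO_LINKER_FLAGS "${DEMO_LINKER_FLAGS}")
    set(CMAKE_EXE_LINKER_FLAGS "${DEMO_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")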
- LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) - ENDIF() - - STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") - STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") - - SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" - CACHE STRING "C flags") - SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}" - CACHE STRING "CXX flags") - SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" - CACHE STRING "shared linker flags") - - SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" - CACHE STRING "executable linker flags") - - MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' " - "with architecture '${ANDROID_ARM_MODE_NAME}', " - "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'") - MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS}) - MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS}) -ELSE() - IF(ANDROID_STANDALONE_TOOLCHAIN) - SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) - ENDIF() - SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - IF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) - ENDIF() - ENDIF() -ENDIF() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake deleted file mode 100644 index f9c6b12136f488a9a6ab77b1ba673b6be75391b5..0000000000000000000000000000000000000000 --- a/cmake/cross_compiling/host.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# find host C compiler -IF(HOST_C_COMPILER) - SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER}) -ELSEIF(NOT $ENV{CC} STREQUAL "") - SET(HOST_C_COMPILER_NAME $ENV{CC}) -ELSE() - SET(HOST_C_COMPILER_NAME cc) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM) -IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n" - "\tcmake .. -DHOST_C_COMPILER=...") -ENDIF() - -# find host CXX compiler -IF(HOST_CXX_COMPILER) - SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER}) -ELSEIF(NOT $ENV{CXX} STREQUAL "") - SET(HOST_CXX_COMPILER_NAME $ENV{CXX}) -ELSE() - SET(HOST_CXX_COMPILER_NAME c++) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM) -IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n" - "\tcmake .. 
-DHOST_CXX_COMPILER=...") -ENDIF() - -SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler") -SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler") - -MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER}) -MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER}) diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake deleted file mode 100644 index 10d389ec8ed57ac2b15dd925ef99c8aff4807b05..0000000000000000000000000000000000000000 --- a/cmake/cross_compiling/ios.cmake +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for iOS, and the -# configuration largely refers to public toolchain file: -# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake -# and -# https://github.com/cristeab/ios-cmake -# -# Supports options: -# IOS_PLATFORM = OS (default) or SIMULATOR -# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders -# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. -# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. -# IOS_ARCH -# The archectures wanted to support, such "arm64", "armv7;arm64" -# IOS_DEPLOYMENT_TARGET -# The minimum iOS deployment version, such as "7.0" -# IOS_ENABLE_BITCODE = ON (default) or OFF -# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON -# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. -# If set manually, it will override the default location and force the user of a particular Developer Platform -# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value. -# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. -# If set manually, this will force the use of a specific SDK version - -# Macros: -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) -# A convenience macro for setting xcode specific properties on targets -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the iOS environment. -# Thanks to the android-cmake project for providing the command - -if(NOT IOS) - return() -endif() - -set(CMAKE_SYSTEM_NAME Darwin) - -# Get the Xcode version being used. 
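A note on the pattern used just below: execute_process() stores the command's exit status in RESULT_VARIABLE, and 0 means success, which is why the deleted code treats "NOT ${XCODE_VERSION_RESULT}" as the success branch. A standalone sketch:

    execute_process(COMMAND xcodebuild -version
                    OUTPUT_VARIABLE _output
                    RESULT_VARIABLE _status
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(_status EQUAL 0)
      message(STATUS "xcodebuild reports: ${_output}")
    endif()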
-execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - RESULT_VARIABLE XCODE_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT ${XCODE_VERSION_RESULT}) - string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") - string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") - message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -else() - message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.") -endif() - -# Required as of cmake 2.8.10 -set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) - -# Setup iOS platform unless specified manually with IOS_PLATFORM -if(NOT DEFINED IOS_PLATFORM) - set(IOS_PLATFORM "OS") -endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") - -# Set the architecture for iOS -if(NOT DEFINED IOS_ARCH) - if(IOS_PLATFORM STREQUAL "OS") - set(IOS_ARCH "armv7;armv7s;arm64") - elseif(IOS_PLATFORM STREQUAL "SIMULATOR") - set(IOS_ARCH "i386;x86_64") - endif() -endif() -set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") - -# Specify minimum iOS deployment version -if(NOT DEFINED IOS_DEPLOYMENT_TARGET) - set(IOS_DEPLOYMENT_TARGET "7.0") -endif() -set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") - -# Whether to enable bitcode -if(NOT DEFINED IOS_ENABLE_BITCODE) - set(IOS_ENABLE_BITCODE ON) -endif() -set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") - -if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) - set(IOS_USE_VECLIB_FOR_BLAS OFF) -endif() -set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") - -# Check the platform selection and setup for developer root -if(${IOS_PLATFORM} STREQUAL "OS") - set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") - set(XCODE_IOS_PLATFORM iphoneos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") -elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") - set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - set(XCODE_IOS_PLATFORM iphonesimulator) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") - set(IOS_PLATFORM_LOCATION "WatchOS.platform") - set(XCODE_IOS_PLATFORM watchos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") -else(${IOS_PLATFORM} STREQUAL "OS") - message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. 
Please set to\n" - "\t OS, SIMULATOR, or WATCHOS.") -endif() - -# Check iOS developer toolchain -if(NOT DEFINED IOS_DEVELOPER_ROOT) - # Setup iOS developer location - execute_process(COMMAND xcode-select -print-path - OUTPUT_VARIABLE XCODE_DEVELOPER_DIR - RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - # Xcode 4.3 changed the installation location, choose the most recent one available - if(${XCODE_VERSION} VERSION_LESS "4.3.0") - set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - else() - set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - endif() -endif() -if(EXISTS ${IOS_DEVELOPER_ROOT}) - set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") -else() - message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") -endif() - -# Check iOS SDK -if(NOT DEFINED IOS_SDK_ROOT) - # Find and use the most recent iOS sdk - file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") - if(IOS_SDK_LISTS) - list(SORT IOS_SDK_LISTS) - list(REVERSE IOS_SDK_LISTS) - list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) - else(IOS_SDK_LISTS) - message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." - " Please manually set IOS_SDK_ROOT or install the iOS SDK.") - endif(IOS_SDK_LISTS) -endif() -if(EXISTS ${IOS_SDK_ROOT}) - set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") - message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") -else() - message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") -endif() - -# Set the sysroot default to the most recent SDK -set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") - -# Get version of iOS SDK -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE IOS_SDK_VERSION - RESULT_VARIABLE IOS_SDK_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(${IOS_SDK_VERSION_RESULT}) - string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") -endif() -if(NOT IOS_SDK_VERSION) - message(WARNING "Cannot get SDK's version.") - set(IOS_SDK_VERSION 1) -endif() -set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) - -# Find the C & C++ compilers for the specified SDK. 
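The compiler discovery below leans on xcrun, which resolves a tool inside a chosen SDK and prints its absolute path on stdout; a minimal standalone sketch:

    execute_process(COMMAND xcrun -sdk iphoneos -find clang
                    OUTPUT_VARIABLE _sdk_clang
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    message(STATUS "SDK clang: ${_sdk_clang}")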
-if(NOT CMAKE_C_COMPILER) - # Default to use clang - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE IOS_C_COMPILER - RESULT_VARIABLE IOS_C_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_C_COMPILER_RESULT}) - get_filename_component(IOS_C_COMPILER clang PROGRAM) - endif() -else(NOT CMAKE_C_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -endif(NOT CMAKE_C_COMPILER) -if(NOT EXISTS ${IOS_C_COMPILER}) - message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") -endif() - -if(NOT CMAKE_CXX_COMPILER) - # Default to use clang++ - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE IOS_CXX_COMPILER - RESULT_VARIABLE IOS_CXX_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_CXX_COMPILER_RESULT}) - get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) - endif() -else(NOT CMAKE_CXX_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -endif(NOT CMAKE_CXX_COMPILER) -if(NOT EXISTS ${IOS_CXX_COMPILER}) - message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") -endif() - -set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) -set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -# Set iOS specific C/C++ flags -if(IOS_PLATFORM STREQUAL "OS") - if(XCODE_VERSION VERSION_LESS "7.0") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. 
- set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") - endif() -else() - set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") -endif() - -if(IOS_ENABLE_BITCODE) - set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode") -else() - set(XCODE_IOS_BITCODE_FLAGS "") -endif() - -set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}") - -# Hidden visibilty is required for cxx on iOS -set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") - -set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") - -if(IOS_USE_VECLIB_FOR_BLAS) - # Find vecLib for iOS - set(VECLIB_SEARCH_DIRS - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks - ) - find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers) - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR) - - if(VECLIB_FOUND) - if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib") - message(STATUS "Found standalone vecLib.framework") - else() - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate") - message(STATUS "Found vecLib as part of Accelerate.framework") - endif() - - endif() -endif() - -set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -if(NOT IOS_ENABLE_BITCODE) - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") -else() - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle") -endif() -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") - -# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree -# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache -# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) -# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex -if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif() - -# Set the find root to the iOS developer roots and to user defined paths -set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} - CACHE string "iOS find search path root") - -# default to searching for frameworks first -set(CMAKE_FIND_FRAMEWORK FIRST) - -# set up the default search directories for frameworks -set(CMAKE_SYSTEM_FRAMEWORK_PATH - ${IOS_SDK_ROOT}/System/Library/Frameworks - ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks - ${IOS_SDK_ROOT}/Developer/Library/Frameworks - ) - -# only search the iOS sdks, not the remainder of the host filesystem -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', " - "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'") -message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") -message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") - -# Used in ExternalProject command -string(REPLACE ";" "\\$" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") -set(EXTERNAL_OPTIONAL_ARGS - -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} - -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES}) - -# This little macro lets you set any XCode specific property -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) - set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) -endmacro(set_xcode_property) - -# This macro lets you find executable programs on the host system -macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(IOS FALSE) - - find_package(${ARGN}) - - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro(find_host_package) diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake deleted file mode 100644 index 0425b2ae158b265fd6f8423b05190a8002f03f20..0000000000000000000000000000000000000000 --- a/cmake/cross_compiling/raspberry_pi.cmake +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Raspberry Pi. -# The supported variables are listed belows: -# -# RPI_TOOLCHAIN -# RPI_ARM_NEON -# -# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments. 
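A sketch of that second option, setting the compilers directly on the command line (the toolchain path is hypothetical):

    cmake .. -DCMAKE_C_COMPILER=/opt/rpi-tools/bin/arm-linux-gnueabihf-gcc \
             -DCMAKE_CXX_COMPILER=/opt/rpi-tools/bin/arm-linux-gnueabihf-g++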
- -IF(NOT RPI) - return() -ENDIF() - -SET(CMAKE_SYSTEM_NAME Linux) -SET(CMAKE_SYSTEM_VERSION 1) -SET(CMAKE_SYSTEM_PROCESSOR arm) - -# check the exist of raspberry pi toolchain -IF(NOT DEFINED RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN} - CACHE PATH "Folder holds the toolchain of Raspberr Pi") -ENDIF() -IF(NOT RPI_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n" - "To cross-compile for Raspberry Pi, you need to download the tools using:\n" - " git clone https://github.com/raspberrypi/tools\n") -ENDIF() - -IF(NOT DEFINED RPI_ARM_NEON) - SET(RPI_ARM_NEON ON) -ENDIF() - -IF(RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN}) - IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$") - # gcc-linaro-arm-linux-gnueabihf-raspbian - # gcc-linaro-arm-linux-gnueabihf-raspbian-x64 - SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf) - ENDIF() - SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-") -ENDIF() - -# C compiler -IF(NOT CMAKE_C_COMPILER) - SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc") -ELSE() - GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}") -ENDIF() - -# CXX compiler -IF(NOT CMAKE_CXX_COMPILER) - SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++") -ELSE() - GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}") -ENDIF() - -SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE) -SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -IF(RPI_ARM_NEON) - SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon") -ENDIF() - -SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 16432ce2b803f6d21bbf47200eda5404269b750f..ea46f6418edf1db70b2a308dd49cf2131cc89d3b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -63,9 +63,7 @@ function(select_nvcc_arch_flags out_variable) # List of arch names set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND archs_names "Auto") - endif() + list(APPEND archs_names "Auto") # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index a743b572a6c3f6f152d85909500e9dbb35c72a01..52507a6ae4aabe300cf8bf88d0946c45a2c0e79c 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -13,7 +13,7 @@ # limitations under the License. 
# -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 95ca16f57f2704eaded85aa5f5c0546310fba3a7..f3ca74faea3629ddce053c49ef1e629f230fdc49 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -71,13 +71,3 @@ if (WIN32) set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) endif(HAVE_SHLWAPI) endif (WIN32) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) - IF(ANDROID) - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) - ENDIF() -ENDIF() - diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8cd0455c16bf84909b735102e7fb1089744c4245..7a6a4523886824a67c82f9ce978de025ddb9c2cd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,14 +26,8 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) -IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # Using the unofficial glog for Android API < 21 - SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") - SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") -ELSE() - SET(GLOG_REPOSITORY "https://github.com/google/glog.git") - SET(GLOG_TAG "v0.3.5") -ENDIF() +SET(GLOG_REPOSITORY "https://github.com/google/glog.git") +SET(GLOG_TAG "v0.3.5") ExternalProject_Add( extern_glog @@ -78,12 +72,3 @@ ADD_DEPENDENCIES(glog extern_glog gflags) LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) - IF(ANDROID) - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index fd9835d023c67b76579913f2ec56c2444fea8c15..c5754da59bf2053931be413eb10c481adecbae6b 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -13,7 +13,7 @@ # limitations under the License. # -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake index 3e36ef7ae205bbf85f345d55456309cc05a58fbd..af7a8bfda6f7db12049203e7c9d54885884d8cf2 100644 --- a/cmake/external/gzstream.cmake +++ b/cmake/external/gzstream.cmake @@ -13,10 +13,6 @@ # limitations under the License. # -IF(MOBILE_INFERENCE) - return() -ENDIF() - include (ExternalProject) # NOTE: gzstream is needed when linking with ctr reader. 
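With the MOBILE_INFERENCE condition dropped, the cares, grpc, and gzstream files above keep the same early-exit guard. Since return() inside an include()d file only ends processing of that file, the remaining third-party dependencies still get configured. The pattern in miniature:

    IF(NOT WITH_DISTRIBUTE)
      return()
    ENDIF()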
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0..39f49d210a20d49a06c120361ecf0a5d07d1af28 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -19,8 +19,8 @@ IF(NOT WITH_LIBXSMM) return() ENDIF() -IF(WIN32 OR APPLE OR ANDROID OR IOS) - MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") +IF(WIN32 OR APPLE) + MESSAGE(WARNING "Windows and Mac are not supported with libxsmm in Paddle yet.") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) return() ENDIF() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 03f0dee85911bdaa0312b624114b7f4aef1fb723..6a7be73f09a278ab0fd29c7599a7781df3d29413 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -110,7 +110,3 @@ else(WIN32) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) -IF(WITH_C_API) - INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) -ENDIF() - diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 43322a257a02c3fd756078db6fe20b582826066a..2caff27357687018f29c1efc55b7b82c9dc3ccf6 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -74,7 +74,3 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) - -IF(WITH_C_API) - INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake deleted file mode 100644 index d42bcb0f329041462bd8b568052fbb8226d18e4e..0000000000000000000000000000000000000000 --- a/cmake/external/nnpack.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# Find the NNPACK library -# NNPACK_ROOT - where to find NNPACK include and library. 
-# - -set(NNPACK_FOUND OFF) -set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") -find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) -find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) -find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) - -if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) - set(NNPACK_FOUND ON) - INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) - - set(NNPACK_LIBS) - list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) - if (NNPACK_UKERNELS_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) - endif() - if (NNPACK_CPUFEATURES_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) - endif() - if(NOT ANDROID) - list(APPEND NNPACK_LIBS "rt") - endif() -else() - message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") -endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index aeb976b840e999a20e8cab11939cbb1f49a27850..b347a592929836a473ac764c0af1153b07d54258 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -40,38 +40,12 @@ IF(NOT ${CBLAS_FOUND}) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") - IF(CMAKE_CROSSCOMPILING) - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) - GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) - SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) - IF(ANDROID) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - # use softfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) - ENDIF() - ELSEIF(IOS) - IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) - ELSE() - MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " - "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") - ENDIF() - ELSEIF(RPI) - # use hardfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) - ENDIF() - ELSE() - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) - ENDIF() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) @@ -92,25 +66,6 @@ IF(NOT ${CBLAS_FOUND}) ELSE() ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) - IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) - # Because libopenblas.a is a symbolic link of another library, thus need to - # install the whole directory. 
- IF(ANDROID) - SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) - ELSE() - SET(TMP_INSTALL_DIR third_party/openblas/lib) - ENDIF() - INSTALL(CODE "execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib - ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} - )" - ) - INSTALL(CODE "MESSAGE(STATUS \"Installing: \" - \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" - )" - ) - ENDIF() ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e1e619e572b05e83fbe751af2e5391aafc494416..e05b7694ddf1e1652b00f156cde1a2d433c9fc46 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,15 +204,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - IF(MOBILE_INFERENCE) - # The reason why the official version is not used is described in - # https://github.com/PaddlePaddle/Paddle/issues/6114 - SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") - SET(PROTOBUF_TAG "v3.2.0") - IF(NOT BUILD_FOR_HOST) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") - ENDIF() - ENDIF() ExternalProject_Add( ${TARGET_NAME} @@ -240,19 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -IF(NOT MOBILE_INFERENCE) - SET(PROTOBUF_VERSION 3.1) -ELSE() - SET(PROTOBUF_VERSION 3.2) -ENDIF() -IF(CMAKE_CROSSCOMPILING) - build_protobuf(protobuf_host TRUE) - LIST(APPEND external_project_dependencies protobuf_host) - - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) -ENDIF() - +SET(PROTOBUF_VERSION 3.1) IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) @@ -266,20 +245,7 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) - IF(WITH_C_API) - INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) - IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) - ENDIF() - ENDIF() - - IF(CMAKE_CROSSCOMPILING) - PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) - ELSE() - SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) - PROMPT_PROTOBUF_LIB(extern_protobuf) - ENDIF() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." 
FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 3b495d78e2c61f90418adbc5746792bc6e49d90b..b4ea268e5a48e29d00b0ec8b957b61a42553ec7e 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7ff5a8aca187240108164900638f5a376e9fbc93..8b43f2ef5c999fc351543ba958c7cc4b0856625d 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) LIST(APPEND external_project_dependencies pslib_brpc) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index f9d4cd97400a68e613e3dd5467191a0d42a9942e..27d075336d556528ffaf1929c34753494692f0a0 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(MOBILE_INFERENCE OR RPI) - return() -endif() - include (ExternalProject) # NOTE: snappy is needed when linking with recordio diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 1ec79462c14e44f2d0abe6904383ebd91d94d35e..392f186b7ce3821f313ed6fc3dd5a97c2a7adebd 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(MOBILE_INFERENCE OR RPI) - return() -ENDIF() - include (ExternalProject) set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake deleted file mode 100644 index de07703695eb14e76eedd3758d55cb98edd1e02b..0000000000000000000000000000000000000000 --- a/cmake/external/swig.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
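In outline, the deleted file below implemented a find-or-build fallback: prefer a system SWIG when one is found, otherwise build one as an ExternalProject and point SWIG_EXECUTABLE at the result. A compressed sketch:

    FIND_PACKAGE(SWIG)
    IF(NOT SWIG_FOUND)
      INCLUDE(ExternalProject)
      # ... download and build swig, then set SWIG_EXECUTABLE to the built binary
    ENDIF()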
- -IF(NOT WITH_SWIG_PY) - return() -ENDIF() - -FIND_PACKAGE(SWIG) - -IF(NOT SWIG_FOUND) - # build swig as an external project - INCLUDE(ExternalProject) - - SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig) - SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig) - SET(SWIG_TARGET_VERSION "3.0.2") - SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41") - SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f") - - IF(WIN32) - # swig.exe available as pre-built binary on Windows: - ExternalProject_Add(swig - URL http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip - URL_MD5 ${SWIG_DOWNLOAD_WIN_MD5} - SOURCE_DIR ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE) - SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe CACHE FILEPATH "SWIG Executable" FORCE) - ELSE(WIN32) - # swig uses bison find it by cmake and pass it down - FIND_PACKAGE(BISON) - - # From SWIG configure - ExternalProject_Add(swig - GIT_REPOSITORY https://github.com/swig/swig.git - GIT_TAG rel-3.0.10 - PREFIX ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND cd && ./autogen.sh && ./configure - --prefix=${SWIG_INSTALL_DIR} --without-pcre - BUILD_COMMAND cd && make - INSTALL_COMMAND cd && make install - UPDATE_COMMAND "" - ) - - SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION}) - SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig) - ENDIF(WIN32) - - LIST(APPEND external_project_dependencies swig) -ENDIF(NOT SWIG_FOUND) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7b937c93febdfa1d7d5f4c73fc2a5830322688e5..7a25aaf15f2c7f46d99394d82d69bc24e4f5cb2c 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-IF(MOBILE_INFERENCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c3e1212d8f8358e0148b5e00223414c9696686ee..a0f300c2e8bab9e7402f869eed1b4c2d1c579aab 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -73,12 +73,3 @@ include_directories(${XXHASH_INCLUDE_DIR}) add_dependencies(xxhash extern_xxhash) LIST(APPEND external_project_dependencies xxhash) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash) - IF(ANDROID) - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index d35073753725cd5772de3fc7a23af5ba69a65558..6c8d79c25e6a2655711fe4450e65600c9a584015 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -59,12 +59,3 @@ SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) ADD_DEPENDENCIES(zlib extern_zlib) LIST(APPEND external_project_dependencies zlib) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) - IF(ANDROID) - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) - ENDIF() -ENDIF() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c4472040cef870454c072c1b84a04e1ac592b476..9e6c47f016fe6dfd809c5b2bc88ff59d0a6b2b84 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -156,10 +156,8 @@ set(GPU_COMMON_FLAGS endif(NOT WIN32) if (APPLE) - if(NOT CMAKE_CROSSCOMPILING) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif() + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set (COMMON_FLAGS -Wno-deprecated-register) endif(APPLE) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 3f1be11d85555671eebb1c2ba3a5642d64d7f2bf..6679a09dfc9dd00cfe3b5c5da3e12bd1c1389432 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -90,11 +90,11 @@ # including binary directory for generated headers. include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE AND NOT ANDROID) +if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") -endif(NOT APPLE AND NOT ANDROID) +endif(NOT APPLE) set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. 
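A quick unit check on the values in these hunks: the new FLAGS_limit_of_tmp_allocation is 4 GiB expressed in bytes (4 * 1024 * 1024 * 1024 = 4294967296), and the 10-minute cap applied next is 10 * 60 = 600 seconds. A hypothetical standalone test would combine them as:

    set_property(TEST demo_test PROPERTY ENVIRONMENT
                 FLAGS_limit_of_tmp_allocation=4294967296)   # 4 GiB
    set_tests_properties(demo_test PROPERTIES TIMEOUT 600)   # 10 minutes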
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -655,12 +657,6 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) set(${SRCS}) set(${HDRS}) - if (MOBILE_INFERENCE) - set(EXTRA_FLAG "lite:") - else() - set(EXTRA_FLAG "") - endif() - foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(FIL_WE ${FIL} NAME_WE) @@ -677,7 +673,7 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} DEPENDS ${ABS_FIL} protoc COMMENT "Running C++ protocol buffer compiler on ${FIL}" VERBATIM ) @@ -714,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3e11d332ff71098adf65e487a39351ae57427e9e..a7dce4dfdb530b13bea9df128694f0946714ccff 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -149,25 +149,23 @@ if (WITH_NGRAPH) ) endif () -if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") +copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") +copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) -endif () +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") +copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") diff --git a/cmake/system.cmake b/cmake/system.cmake index c91ef91127511da9ac8b9e11349f4a23aaedd613..65db05bebe957d740e391847d980e211b0e9e750 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -74,21 +74,6 @@ 
MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") -# configuration for cross-compiling -IF(DEFINED CMAKE_SYSTEM_NAME) - INCLUDE(cross_compiling/host) - IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - SET(ANDROID TRUE) - INCLUDE(cross_compiling/android) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") - SET(RPI TRUE) - INCLUDE(cross_compiling/raspberry_pi) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - SET(IOS TRUE) - INCLUDE(cross_compiling/ios) - ENDIF() -ENDIF() - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index 0dc33ce385175d1e2dc454d41db467d4b9d9cf9a..02667dbce69ed159193ff88f38069dd08cdcf678 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -53,118 +53,3 @@ function(target_circle_link_libraries TARGET_NAME) "-Wl,--end-group") endif() endfunction() - -# compile_cu_as_cpp -# Make a cu file compiled as C++ -# Arguments: Source files -macro(compile_cu_as_cpp) - foreach(s ${ARGN}) - set_source_files_properties(${s} PROPERTIES LANGUAGE CXX) - set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++") - endforeach() -endmacro() - -# link_paddle_exe -# add paddle library for a paddle executable, such as trainer, pserver. -# -# It will handle WITH_PYTHON etc. -function(link_paddle_exe TARGET_NAME) - if(WITH_RDMA) - generate_rdma_links() - endif() - - if(MOBILE_INFERENCE) - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - else() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - paddle_optimizer - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - endif() - - if(ANDROID) - target_link_libraries(${TARGET_NAME} log) - endif(ANDROID) - - if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") - endif() - - add_dependencies(${TARGET_NAME} ${external_project_dependencies}) -endfunction() - -# link_paddle_test -# Link a paddle unittest for target -# TARGET_NAME: the unittest target name -# Rest Arguemnts: not used. -function(link_paddle_test TARGET_NAME) - link_paddle_exe(${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} - paddle_test_main - paddle_test_util - ${GTEST_LIBRARIES}) -endfunction() - -# add_unittest_without_exec -# -# create a paddle unittest. not specifically define how to run this unittest. -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. -macro(add_unittest_without_exec TARGET_NAME) - add_executable(${TARGET_NAME} ${ARGN}) - link_paddle_test(${TARGET_NAME}) -endmacro() - -# add_unittest -# create a paddle unittest and just to execute this binary to make unittest. -# -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. 
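The removed link_paddle_test/add_unittest helpers above linked every legacy unit test against paddle_test_main, paddle_test_util, and ${GTEST_LIBRARIES}. For context, a sketch of the kind of shared gtest entry point a target like paddle_test_main typically supplies — assumed, since that file's source is not part of this patch:

```cpp
#include <gflags/gflags.h>
#include <gtest/gtest.h>

// Hypothetical stand-in for the shared test main that paddle_test_main
// linked into each executable built by the removed add_unittest macros.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);               // consume gtest flags first
  gflags::ParseCommandLineFlags(&argc, &argv, true);  // then the gflags ones
  return RUN_ALL_TESTS();
}
```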
-macro(add_unittest TARGET_NAME) - add_unittest_without_exec(${TARGET_NAME} ${ARGN}) - add_test(${TARGET_NAME} ${TARGET_NAME}) -endmacro() - -# add_simple_unittest -# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to -# ${TARGET_NAME} and then execute it. -macro(add_simple_unittest TARGET_NAME) - add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) -endmacro() - -# Creates C resources file from files in given resource file -function(create_resources res_file output_file) - add_custom_command( - OUTPUT ${output_file} - COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) -endfunction() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ad39542b4d8356229a3e060bffb45d8ea89f5b70..7ceb180193191f90ea5178f63c569d6ecab65075 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) @@ -121,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 
'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -197,7 +198,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -318,6 +319,7 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) +paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a167511160d074c13ca1dca36b4f2c5eeea4bb93..8cb0c4e668a71e1c06e2cf13ad6b25854077e705 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -131,8 +131,6 @@ cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc if(WITH_NGRAPH) 
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -171,13 +169,12 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - else() - if(WITH_NGRAPH) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(WITH_NGRAPH) + if (WITH_NGRAPH) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine) + else () cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(WITH_NGRAPH) + endif() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 756470c5b0bfb2ce2a90531442211bcb19172dae..ce5731a1f414e8ef6d8af22a3bb17109e82beb87 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -243,3 +244,4 @@ USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(lock_free_optimize_pass); +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 603df2e06936e3d9d8e7ec62efd0c6e83200239c..cd24a3175953bf323748bf0c7e3159761c13f0a9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7ceecce9193acfae0b4e03c06212edd6d6..4323883fa5cc9b26a68c2980f3b7a49eca610543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 6cf405efe63d2bc284c4650771a747b27bb4a9f6..33ccee6aa0a94b8fd8308214d6144ae832d40bab 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -28,10 +28,14 @@ std::unique_ptr Pass::Apply(std::unique_ptr graph) const { PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } + auto* native_graph = graph.get(); auto applied_graph = ApplyImpl(std::move(graph)); // TODO(panyx0718): Add more verifications. PADDLE_ENFORCE(!HasCircle(*applied_graph), "Illegal Pass. 
Generated graph shouldn't has cycle."); + PADDLE_ENFORCE(applied_graph.get() == native_graph, + "Pass::Apply() cannot delete the passed graph and shouldn't " + "return a new graph.(For the need of pybind11)"); applied_ = true; return applied_graph; } diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8fbbc6584e121d22bdec8173d501a35dc97c9c06..f46bdf96ba1e9e1e137c690057051d9a127d45c9 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { if (!platform::is_cpu_place(t.place())) { - LoDTensor tt; - framework::TensorCopy(t, platform::CPUPlace(), &tt); + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(t.place()); dev_ctx.Wait(); - os << tt; + os << cpu_tensor; return os; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index c3a044d22cf04dceecc164fae934ee15c4563af1..5d854cb8d7856a631faf01741d29d3cecfd9a627 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44bea208b66acc3b3f4bc0f4adee4fb860..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/variant.h" - -#include "ngraph/type/element_type.hpp" - -namespace paddle { -namespace framework { - -class NgraphOperator : public OperatorBase { - public: - static std::vector< - std::vector>::iterator>> - NgraphOpIntervals( - std::vector>* ops); - - explicit NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type = "fused_op", const VariableNameMap& inputs = {}, - const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); - - void RunImpl(const Scope& scope, const platform::Place& place) const final; - - private: - const ProgramDesc pdesc_; - size_t block_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - bool is_full_ = false; - - void Process(); -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38f811c0e9e8ecc6a4226e17e621a3c2ef3f78c5..ee9f6a480542845beffdb26767ce1b1578118725 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -19,8 +19,6 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a730b84a916ea2c3e17dd4becaf939cc28160457..5db422119966948f75970874e13d416ea699158a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b7df4b8886d629e98225c95eae9a4f2ed9400710..8029129b9a6a9fcbc0ff10daa1f25b210259e9d8 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
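In the layer.cc hunk below, the float-only element-wise loop in AddTo is replaced by a boost::static_visitor (TensorAddToFunctor) that selects a BLAS AXPY implementation per place. A self-contained sketch of that visit-by-place dispatch pattern, with stand-in place types rather than Paddle's:

```cpp
#include <iostream>

#include <boost/variant.hpp>

// Stand-ins for platform::CPUPlace / platform::CUDAPlace.
struct CPUPlace {};
struct CUDAPlace {};
using Place = boost::variant<CPUPlace, CUDAPlace>;

// One operator() overload per place type; the real functor instead calls
// blas.AXPY with the device context that matches the place.
struct AddToVisitor : public boost::static_visitor<> {
  void operator()(const CPUPlace&) const { std::cout << "cpu axpy\n"; }
  void operator()(const CUDAPlace&) const { std::cout << "gpu axpy\n"; }
};

int main() {
  Place place = CPUPlace{};
  AddToVisitor visitor;
  boost::apply_visitor(visitor, place);  // dispatches to the CPU overload
}
```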
#include "paddle/fluid/imperative/layer.h" + #include #include #include @@ -22,6 +23,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -34,22 +38,66 @@ std::map py_funcs_; using framework::Variable; -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label // ugly fix for it if (src_tensor->numel() == 0) { return; } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (int64_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { @@ -120,6 +168,36 @@ class Autograd { } }; +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + std::unique_ptr new_var(new VarBase()); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->Resize(var_->Get().dims()); + tensor->set_lod(var_->Get().lod()); + + if (blocking) { + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + } + + return new_var; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); @@ -162,9 +240,8 @@ std::map> OpBase::ApplyGrad() { PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); } @@ -176,7 +253,7 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < outputs.size(); ++i) { framework::Variable* grad = outputs[i]; framework::Variable* orig_grad = origin_outputs[i]; - AddTo(grad, orig_grad); + AddTo(grad, orig_grad, place_); delete grad; } } @@ -188,8 +265,10 @@ void VarBase::RunBackward() { VLOG(3) << "start backward"; auto grads_t = grads_->var_->GetMutable(); - float* data = grads_t->mutable_data(platform::CPUPlace()); - std::fill(data, data + grads_t->numel(), 1.0); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + var_->GetMutable()->place())), + grads_t, 1.0); PADDLE_ENFORCE( grads_ == diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 0b1077c640e076797ba7e0200dc8d0eb8bfcff16..633924aa417b8bd64bf4921054f82fdb7f7868fe 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -21,17 +21,21 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { +class VarBase; + namespace py = ::pybind11; class PreparedOp { @@ -81,6 +85,8 @@ class PreparedOp { return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); } + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } + const framework::OperatorBase& op; const framework::RuntimeContext& ctx; framework::OperatorWithKernel::OpKernelFunc func; 
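VarBase::NewVarBase above has two distinct contracts: with blocking=true it copies via TensorCopySync and then waits on the destination device context, so the data is complete on return; with blocking=false it only enqueues an asynchronous TensorCopy. A hypothetical caller to make that concrete — only NewVarBase and the place types are from this patch:

```cpp
#include <memory>

#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/platform/place.h"

// Hypothetical helper: pull a device-side VarBase back to host memory.
std::unique_ptr<paddle::imperative::VarBase> FetchToCPU(
    const paddle::imperative::VarBase& var) {
  // blocking=true: safe to read the returned tensor immediately. With
  // blocking=false the caller would have to synchronize the destination
  // device context before touching the data.
  return var.NewVarBase(paddle::platform::CPUPlace(), /*blocking=*/true);
}
```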
@@ -148,6 +154,9 @@ class VarBase { framework::LoDTensor& GradValue(); + std::unique_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; + inline std::string GradName() const { PADDLE_ENFORCE( var_desc_, @@ -176,7 +185,8 @@ class OpBase { : op_desc_(nullptr), forward_id_(-1), grad_op_desc_(nullptr), - backward_id_(-1) {} + backward_id_(-1), + place_(platform::CPUPlace()) {} virtual ~OpBase() { if (grad_op_desc_) delete grad_op_desc_; @@ -193,6 +203,8 @@ class OpBase { framework::OpDesc* grad_op_desc_; int backward_id_; + platform::Place place_; + VarBasePtrMap input_vars_; VarBasePtrMap output_vars_; OpBasePtrMap pre_ops_; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7289f23d11a9e5350dba4a94546c4d76b7030e12..b590c3a2ffabec95ed886426998e1c6e4fcc07fd 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace imperative { @@ -33,16 +37,38 @@ void CreateGradOp(const framework::OpDesc& op_desc, *grad_op_desc = grad_op_descs[0].release(); } -void InitVar(framework::Variable* var, framework::Variable* grad_var) { +void InitVar(framework::Variable* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); auto& var_t = var->Get(); - float* data = - grad_var->GetMutable()->mutable_data( - var_t.dims(), platform::CPUPlace()); - std::fill(data, data + var_t.numel(), 0.0); + grad_var->GetMutable()->mutable_data( + var_t.dims(), dev_ctx->GetPlace()); + operators::math::set_constant( + *dev_ctx, grad_var->GetMutable(), 0.0); +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (auto it : inputs) { + for (VarBase* var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "Input variable should keep in the same place: %s, but get place: " + "%s of input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; } void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, const bool stop_gradient) { std::map vars; @@ -107,10 +133,11 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func(framework::ExecutionContext( + prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { framework::OpDesc* grad_op_desc; @@ -133,7 +160,8 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } else { VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + InitVar(var->var_, var->grads_->var_, + 
prepared_op.GetDeviceContext()); } // Douts. grad_in_vars.push_back(var->grads_->var_); @@ -146,10 +174,13 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, for (const std::string& grad_outvar : it.second) { block->FindRecursiveOrCreateVar(grad_outvar); auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end()); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op_desc->Type()); VarBase* var = vars[var_it->second]; if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext()); } grad_out_vars.push_back(var->grads_->var_); } @@ -191,16 +222,23 @@ std::vector Tracer::PyTrace(OpBase* op, for (VarBase* out : outputs) { grad_input_vars.push_back(out->var_); } + + platform::CPUPlace place; for (VarBase* out : outputs) { grad_input_vars.push_back(out->grads_->var_); if (!grad_input_vars.back()->IsInitialized()) { - InitVar(out->var_, grad_input_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(out->var_, grad_input_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } + for (const VarBase* inp : inputs) { grad_output_vars.push_back(inp->grads_->var_); if (!grad_output_vars.back()->IsInitialized()) { - InitVar(inp->var_, grad_output_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + InitVar(inp->var_, grad_output_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f225d8abe6c0635d2bdd8dba0b12c7fc3a4110db..690838215581b09ff35a0ea13f30655b77e6e187 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace imperative { @@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, void InitVar(framework::Variable* var, framework::Variable* grad_var); +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} virtual ~Tracer() {} - void Trace(OpBase* op, - const std::map>& inputs, - const std::map>& outputs, - framework::BlockDesc* block, const bool stop_gradient = false); + void Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); private: + platform::Place GetPlace(const VarBasePtrMap& inputs); + framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 691c336ebe4b6a6cb60023859b21665b6a4756a8..9d74dc6c211e4fcb6d1e7de5369eee847f49fc78 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument 
ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d209922ebe8d31723ca25c71a952ea08bd6..391932a1ee018c45818457c55fd8f82a22ab7405 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 01d7f700da9cc67d0ebbd3d9649e3823f58a8811..c5a413221ebff6b9be114151dbb93fd23a148440 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { // init trt engine cudaStream_t stream_; std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); - engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f313beb73bb0d21cab1d62859a46fcc76a373548..e83961f3d7bda03a7659f175c59105dcb60708e9 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -78,11 +78,9 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - // create engine. - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); - engine_->InitNetwork(); - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_->InitNetwork(); } // Declare a Variable as input with random initialization. @@ -175,7 +173,7 @@ class TRTConvertValidation { op_->Run(scope_, place); // Execute TRT. 
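The TensorRT hunks in this patch change the engine to borrow a caller-owned cudaStream_t by value instead of holding a cudaStream_t* (and possibly creating a default stream of its own), which is why the tests now create the stream before constructing the engine. A minimal sketch of the resulting calling convention, using the constructor as declared in the engine.h hunk further down; network setup is elided:

```cpp
#include <cuda_runtime.h>

#include "paddle/fluid/inference/tensorrt/engine.h"

void BuildAndRun() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);  // stream must exist before the engine borrows it
  paddle::inference::tensorrt::TensorRTEngine engine(
      /*max_batch=*/5, /*max_workspace=*/1 << 15, stream);
  engine.InitNetwork();
  // ... DeclareInput/DeclareOutput, FreezeNetwork(), Execute(batch_size) ...
  cudaStreamSynchronize(engine.stream());  // a value now, so no dereference
}
```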
engine_->Execute(batch_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; @@ -184,7 +182,7 @@ class TRTConvertValidation { std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc44805cb0fb3246385609cf16ba744a..78b590f15d639f7b21b403413760948c6343d998 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } - PADDLE_ENFORCE_NOT_NULL(stream_); - infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); - cudaStreamSynchronize(*stream_); + infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); + cudaStreamSynchronize(stream_); SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(*stream_); + cudaStreamSynchronize(stream_); // clean buffer for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { @@ -173,7 +172,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, *stream_), + cudaMemcpyDeviceToDevice, stream_), 0); } @@ -194,7 +193,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -211,12 +210,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, *stream_)); + cudaMemcpyHostToDevice, stream_)); } void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, @@ -227,7 +225,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, *stream_)); + cudaMemcpyDeviceToDevice, stream_)); } void TensorRTEngine::SetITensor(const std::string &name, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9e6fefc1d6c14640d696c3bf3ac8249..65ab7f3caaa746cf339de67706939070a0b7d87d 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,17 +54,14 @@ class TensorRTEngine : public EngineBase { 
nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, int device = 0, + TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, + int device = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream ? stream : &default_stream_), + stream_(stream), logger_(logger), - device_(device) { - freshDeviceId(); - cudaStreamCreate(stream_); - } + device_(device) {} virtual ~TensorRTEngine(); @@ -102,7 +99,7 @@ class TensorRTEngine : public EngineBase { // NOTE this should be used after calling `FreezeNetwork`. Buffer& buffer(const std::string& name) override; - cudaStream_t* stream() { return stream_; } + cudaStream_t stream() { return stream_; } // Fill an input from CPU memory with name and size. void SetInputFromCPU(const std::string& name, const void* data, size_t size); @@ -158,9 +155,8 @@ class TensorRTEngine : public EngineBase { // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t* stream_; - // If stream_ is not set from outside, hold its own stream. - cudaStream_t default_stream_; + cudaStream_t stream_; + nvinfer1::ILogger& logger_; std::vector buffers_; @@ -208,38 +204,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. - TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index da1f6535cb3b2476cd475797861d6d2bb6d88856..9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - // ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, &stream_); + ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d2ca1d0b0098fd377725cc0fdf002d8d2e3539ec..b1f7a3464ac6027faffe283bccaf9793eae939e1 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + void PrintConfig(const 
PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = reinterpret_cast(config); @@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. for (size_t j = 0; j < len; ++j) { - *(input_data + j) = static_cast(j) / len; + *(input_data + j) = Random(0.0, 1.0) / 10.; } } (*inputs).emplace_back(input_slots); @@ -344,6 +351,16 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 5aca807ee3aee1bb323abe6d5c3700dfc08e30b4..db7109b7505d4fe4dcfcf88f303aa262bc5b44fb 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) { inputs_all); } +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + contrib::AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + auto config = + reinterpret_cast(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 100; i++) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt */ true); @@ -162,5 +183,15 @@ TEST(TensorRT_mobilenet, profile) { profile(model_dir, true, false); } +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, true); +} + +TEST(resnet50, compare_continuous_input_native) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9705d75e70c8d9d9cbc433dd6358596..5d8684f083bda8499000c9fd0a7617cf129db13b 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" @@ -37,7 +38,7 @@ template void *Alloc(const Place &place, size_t size); template -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template 
size_t Used(const Place &place); @@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. @@ -98,7 +104,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -177,9 +184,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +202,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +259,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5ad639bf6176328e94da6eb71a41790c..e099425b94221bf1229e936fc1781615d13dbc26 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) @@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table 
context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 2bebdb345ab324eb0a2dafd54c74833dd21bdb6d..c054fdb1ba6e5ae5970a51ac9f071f6ef535a4b5 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( T* dbias) { const int outer_size = C; const int inner_size = N * HxW; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ds_storage; __shared__ typename BlockReduce::TempStorage db_storage; @@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ds_sum += dy[index] * x[index]; db_sum += dy[index]; } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + auto ds_out = + BlockReduce(ds_storage).Reduce(static_cast(ds_sum), cub::Sum()); + auto db_out = + BlockReduce(db_storage).Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); if (threadIdx.x == 0) { - dscale[i] = ds_sum; - dbias[i] = db_sum; + dscale[i] = ds_out; + dbias[i] = db_out; } - __syncthreads(); } } diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d91c5a81f39594b6dab7e5e717c9818f..e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, 
select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. - items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidates ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated out according to Input(scores) in this operator.") + .AsDispensable(); AddInput("scores", - "(LoDTensor) The LodTensor containing the accumulated scores " - "corresponding to Input(ids) and its shape is the same as the " - "shape of Input(ids)."); + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). 
If Input(ids) is not nullptr, its "
+             "shape is the same as that of Input(ids). "
+             "If is_accumulated is true, Input(scores) is accumulated scores "
+             "and will be used directly. Else, each score will be "
+             "transformed to the log domain and accumulated with "
+             "Input(pre_scores) first.");
     AddOutput("selected_ids",
               "A LodTensor that stores the IDs selected by beam search.");
     AddOutput("selected_scores",
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator does the search in beams for one time step.
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
                      arg);
     }
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }
 
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto *scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4.
+    // Compute on CPU for cases with batch_size > 4.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ef9476eee5d3fac4decd7273da824b2f2349199
--- /dev/null
+++ b/paddle/fluid/operators/beam_search_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
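GetExpectedKernelType above derives the batch size from the LoD of the scores tensor: a row of N + 1 offsets describes N sequences. A small self-contained illustration of that idiom, with assumed example values:

#include <cstdio>
#include <vector>

int main() {
  // LoD offsets [0, 2, 5, 6] describe 3 sequences (lengths 2, 3 and 1), so
  // batch_size = offsets.size() - 1, exactly as in the kernel-type logic.
  std::vector<std::size_t> offsets = {0, 2, 5, 6};
  std::printf("batch_size = %zu\n", offsets.size() - 1);
  return 0;
}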
*/ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index b5e2ed05924cc8b7bc06058b9b1103ba10be486e..1b939e742de06aedf187d25d002d19e0a4fafc9d 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. 
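The removed comment around this point documents beam search over a two-level LoD. A compact sketch of walking such a structure, reusing the example offsets from that comment (values are illustrative):

#include <cstdio>
#include <vector>

int main() {
  std::vector<std::size_t> level0 = {0, 1, 4};        // 2 source sentences
  std::vector<std::size_t> level1 = {0, 1, 2, 3, 4};  // 4 prefixes in total
  // Source i owns prefixes [level0[i], level0[i+1]); each prefix j owns
  // candidate rows [level1[j], level1[j+1]) of the ids/scores tensors.
  for (std::size_t i = 0; i + 1 < level0.size(); ++i)
    std::printf("source %zu -> prefixes [%zu, %zu)\n", i, level0[i],
                level0[i + 1]);
  return 0;
}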
- * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. - */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. 
- */
-  bool NextItemSet(const framework::LoDTensor& pre_ids,
-                   const framework::LoDTensor& pre_scores,
-                   std::vector<Item>* items);
-
- private:
-  size_t beam_size_;
-  const framework::LoDTensor* ids_;
-  const framework::LoDTensor* scores_;
-  size_t lod_level_{0};
-  size_t sent_offset_{0};
-  int end_id_{0};
-};
-
-std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
-
-std::string ItemToString(const BeamSearch::Item& item);
-
 template <typename DeviceContext, typename T>
 class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto* scores = context.Input<framework::LoDTensor>("scores");
     auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
     auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
-    PADDLE_ENFORCE_NOT_NULL(ids);
+    PADDLE_ENFORCE_NOT_NULL(scores);
     PADDLE_ENFORCE_NOT_NULL(pre_ids);
     PADDLE_ENFORCE_NOT_NULL(pre_scores);
@@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     size_t level = context.Attr<int>("level");
     size_t beam_size = context.Attr<int>("beam_size");
     int end_id = context.Attr<int>("end_id");
-    BeamSearch alg(*ids, *scores, level, beam_size, end_id);
+    bool is_accumulated = context.Attr<bool>("is_accumulated");
+
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
-    alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
+
+    math::BeamSearchFunctor<DeviceContext, T> alg;
+    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
+        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
+        is_accumulated);
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
deleted file mode 100644
index 40b46781daa989fcd89887a3c01e97e39ea71255..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
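The deleted CPU implementation selected each source's top candidates with std::nth_element before this logic moved into math::BeamSearchFunctor. A self-contained sketch of that pruning step; Item is a stand-in for the removed BeamSearch::Item:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Item { int id; float score; };

void PruneToBeam(std::vector<Item> *items, std::size_t beam_size) {
  if (items->size() <= beam_size) return;  // guard the removed code lacked
  // Partition so the beam_size highest scores come first; order inside the
  // beam is unspecified, which is all the original caller required.
  std::nth_element(
      items->begin(), items->begin() + beam_size, items->end(),
      [](const Item &a, const Item &b) { return a.score > b.score; });
  items->resize(beam_size);
}

int main() {
  std::vector<Item> items = {{4, 0.5f}, {2, 0.3f}, {5, 0.9f}};
  PruneToBeam(&items, 2);
  std::printf("kept %zu items\n", items.size());
  return 0;
}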
*/ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. -TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af82146e7c69c7c5aab8f08d0fe0d1710..f9570e4e2ed0d9ac8739410eb7cd7397ad09fae4 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6c85f1577e0c49d00f4ccf7fa7be0974eb62bdf3..d3a61dc367c3642b8faa9085a470a302712395e5 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -45,3 +45,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op foreach(src ${LOCAL_DETECTION_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") endforeach() + +cc_library(mask_util SRCS mask_util.cc DEPS memory) +cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 
6abeca1da443248d6ad3c1bcc64dd775d77f4ed8..b99edb5bf05f94e762b377a8882e4c3fcdb5afad 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -1,13 +1,17 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #pragma once #include #include "paddle/fluid/framework/eigen.h" @@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_w = std::max(x_max - x_min + 1, zero); inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + overlaps_et(i, j) = + (inter_area == 0.) ? 0 : inter_area / + (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..46727c29de13c1213694540e6614a05f9008d232 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -0,0 +1,437 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
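The bbox_util.h change above guards the IoU division for the empty-intersection case. The same guard in minimal standalone form, assuming the plain-array box layout [xmin, ymin, xmax, ymax] and the +1 pixel convention used in BboxOverlaps:

#include <algorithm>
#include <cstdio>

float IoU(const float a[4], const float b[4]) {
  float iw = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]) + 1);
  float ih = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]) + 1);
  float inter = iw * ih;
  float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  // Return exactly 0 for an empty intersection, mirroring the guard above.
  return inter == 0.f ? 0.f : inter / (area_a + area_b - inter);
}

int main() {
  const float a[4] = {0, 0, 4, 4}, b[4] = {10, 10, 12, 12};
  std::printf("IoU = %f\n", IoU(a, b));  // disjoint boxes -> 0
  return 0;
}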
*/
+
+#include <math.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
+#include "paddle/fluid/operators/detection/mask_util.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+const int kBoxDim = 4;
+
+template <typename T>
+void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) {
+  auto* out_data = out->data<T>();
+  auto* to_add_data = to_add->data<T>();
+  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
+}
+
+class GenerateMaskLabelsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
+                   "Input(GtClasses) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GtSegms"),
+                   "Input(GtSegms) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"),
+                   "Input(LabelsInt32) shouldn't be null.");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("MaskRois"),
+        "Output(MaskRois) of GenerateMaskLabelsOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("RoiHasMaskInt32"),
+        "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("MaskInt32"),
+        "Output(MaskInt32) of GenerateMaskLabelsOp should not be null");
+
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    auto gt_segms_dims = ctx->GetInputDim("GtSegms");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2,
+                      "The rank of Input(GtSegms) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2,
+                      "The second dim of Input(GtSegms) must be 2.");
+    int num_classes = ctx->Attrs().Get<int>("num_classes");
+    int resolution = ctx->Attrs().Get<int>("resolution");
+
+    ctx->SetOutputDim("MaskRois", {-1, 4});
+    ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1});
+    ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois"));
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
+};
+
+/*
+ * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2)
+ * to encode class specific mask targets.
+ */ +template +static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, + const Tensor& masks, + const Tensor& mask_class_labels, + const int resolution, const int num_classes, + Tensor* mask_targets) { + const uint8_t* masks_data = masks.data(); + int64_t num_mask = masks.dims()[0]; + const int* mask_class_labels_data = mask_class_labels.data(); + const int M = resolution * resolution; + const int mask_dim = M * num_classes; + + int* mask_targets_data = + mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); + math::set_constant(ctx, mask_targets, -1); + for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { + int cls = mask_class_labels_data[mask_id]; + int start = M * cls; + if (cls > 0) { + for (int i = 0; i < M; ++i) { + mask_targets_data[mask_id * mask_dim + start + i] = + static_cast(masks_data[mask_id * M + i]); + } + } + } +} + +template +std::vector SampleMaskForOneImage( + const platform::CPUDeviceContext& ctx, const Tensor& im_info, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms, + const Tensor& rois, const Tensor& label_int32, const int num_classes, + const int resolution, const framework::LoD& segm_length) { + // Prepare the mask targets by associating one gt mask to each training roi + // that has a fg (non-bg) class label. + const int64_t gt_size = static_cast(gt_classes.dims()[0]); + const int64_t roi_size = static_cast(rois.dims()[0]); + const int* gt_classes_data = gt_classes.data(); + const int* is_crowd_data = is_crowd.data(); + const int* label_int32_data = label_int32.data(); + PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]); + + std::vector mask_gt_inds, fg_inds; + std::vector>> gt_polys; + + auto polys_num = segm_length[1]; + auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); + auto lod1 = segm_lod_offset[1]; + auto lod2 = segm_lod_offset[2]; + const T* polys_data = gt_segms.data(); + for (int64_t i = 0; i < gt_size; ++i) { + if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { + mask_gt_inds.emplace_back(i); + + // slice fg segmentation polys + int poly_num = polys_num[i]; + std::vector> polys; + int s_idx = lod1[i]; + for (int j = 0; j < poly_num; ++j) { + int s = lod2[s_idx + j]; + int e = lod2[s_idx + j + 1]; + PADDLE_ENFORCE_NE(s, e); + std::vector plts(polys_data + s * 2, polys_data + e * 2); + polys.push_back(plts); + } + gt_polys.push_back(polys); + } + } + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] > 0) { + fg_inds.emplace_back(i); + } + } + int gt_num = mask_gt_inds.size(); + int fg_num = fg_inds.size(); + + Tensor boxes_from_polys; + boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); + Poly2Boxes(gt_polys, boxes_from_polys.data()); + + std::vector roi_has_mask = + std::vector(fg_inds.begin(), fg_inds.end()); + Tensor mask_class_labels; + Tensor masks; + Tensor rois_fg; + + auto im_scale = im_info.data()[2]; + if (fg_num > 0) { + // Class labels for the foreground rois + mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); + Gather(label_int32_data, 1, fg_inds.data(), fg_inds.size(), + mask_class_labels.data()); + + uint8_t* masks_data = masks.mutable_data( + {fg_num, resolution * resolution}, ctx.GetPlace()); + + // Find overlap between all foreground rois and the bounding boxes + // enclosing each segmentation + T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); + Gather(rois.data(), 4, fg_inds.data(), fg_inds.size(), + rois_fg.data()); + + for (int k = 0; k < rois_fg.numel(); ++k) { + 
rois_fg_data[k] = rois_fg_data[k] / im_scale; + } + + Tensor overlaps_bbfg_bbpolys; + overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); + BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); + + // Map from each fg rois to the index of the mask with highest overlap + // (measured by bbox overlap) + T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); + std::vector fg_masks_inds; + for (int64_t i = 0; i < fg_num; ++i) { + const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; + T max_overlap = std::numeric_limits::min(); + int id = 0; + for (int64_t j = 0; j < gt_num; ++j) { + if (v[j] > max_overlap) { + max_overlap = v[j]; + id = j; + } + } + fg_masks_inds.push_back(id); + } + + // add fg targets + for (int64_t i = 0; i < fg_num; ++i) { + int fg_polys_ind = fg_masks_inds[i]; + T* roi_fg = rois_fg_data + i * 4; + uint8_t* mask = masks_data + i * resolution * resolution; + Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); + } + } else { + // The network cannot handle empty blobs, so we must provide a mask + // We simply take the first bg roi, given it an all -1's mask (ignore + // label), and label it with class zero (bg). + int bg_num = 1; + T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); + const T* rois_data = rois.data(); + std::vector bg_inds; + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] == 0) { + bg_inds.emplace_back(i); + rois_fg_data[0] = rois_data[0] / im_scale; + rois_fg_data[1] = rois_data[1] / im_scale; + rois_fg_data[2] = rois_data[2] / im_scale; + rois_fg_data[3] = rois_data[3] / im_scale; + break; + } + } + masks.mutable_data({bg_num, resolution * resolution}, + ctx.GetPlace()); + math::set_constant(ctx, &masks, -1); + int* mask_class_labels_data = + mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); + mask_class_labels_data[0] = 0; + roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); + } + + Tensor masks_expand; + ExpandMaskTarget(ctx, masks, mask_class_labels, resolution, num_classes, + &masks_expand); + + T* rois_fg_data = rois_fg.data(); + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] * im_scale; + } + + Tensor roi_has_mask_t; + int roi_has_mask_size = roi_has_mask.size(); + int* roi_has_mask_data = + roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); + std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); + + std::vector res; + res.emplace_back(rois_fg); + res.emplace_back(roi_has_mask_t); + res.emplace_back(masks_expand); + return res; +} + +template +class GenerateMaskLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* im_info = ctx.Input("ImInfo"); + auto* gt_classes = ctx.Input("GtClasses"); + auto* is_crowd = ctx.Input("IsCrowd"); + auto* gt_segms = ctx.Input("GtSegms"); + auto* rois = ctx.Input("Rois"); + auto* label_int32 = ctx.Input("LabelsInt32"); + + auto* mask_rois = ctx.Output("MaskRois"); + auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); + auto* mask_int32 = ctx.Output("MaskInt32"); + + int num_classes = ctx.Attr("num_classes"); + int resolution = ctx.Attr("resolution"); + + PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL, + "GenerateMaskLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateMaskLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, + "GenerateMaskLabelsOp rois needs 1 level of LoD"); 
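SampleMaskForOneImage above maps every foreground RoI to the ground-truth mask whose bounding box overlaps it most, i.e. a row-wise argmax over the overlap matrix. A minimal sketch of that step on a flat row-major matrix, with illustrative names:

#include <cstdio>
#include <vector>

// For each of n rows, return the column index holding the row maximum.
std::vector<int> RowArgmax(const std::vector<float> &m, int n, int cols) {
  std::vector<int> best(n, 0);
  for (int i = 0; i < n; ++i)
    for (int j = 1; j < cols; ++j)
      if (m[i * cols + j] > m[i * cols + best[i]]) best[i] = j;
  return best;
}

int main() {
  // 2 foreground RoIs x 3 gt masks; overlap values assumed for illustration.
  std::vector<float> overlaps = {0.1f, 0.7f, 0.2f, 0.9f, 0.0f, 0.3f};
  std::vector<int> best = RowArgmax(overlaps, 2, 3);
  std::printf("roi0 -> gt%d, roi1 -> gt%d\n", best[0], best[1]);
  return 0;
}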
+    PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL,
+                      "GenerateMaskLabelsOp label_int32 needs 1 level of LoD");
+
+    PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL);
+
+    int64_t n = static_cast<int64_t>(gt_classes->lod().back().size() - 1);
+    PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n);
+
+    int mask_dim = num_classes * resolution * resolution;
+
+    mask_rois->mutable_data<T>({rois->numel(), kBoxDim}, ctx.GetPlace());
+    roi_has_mask_int32->mutable_data<int>({rois->numel(), 1}, ctx.GetPlace());
+    mask_int32->mutable_data<int>({rois->numel(), mask_dim}, ctx.GetPlace());
+
+    framework::LoD lod;
+    std::vector<size_t> lod0(1, 0);
+
+    int64_t num_mask = 0;
+    auto& dev_ctx = ctx.device_context<platform::CPUDeviceContext>();
+
+    auto gt_classes_lod = gt_classes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
+    auto rois_lod = rois->lod().back();
+    auto label_int32_lod = label_int32->lod().back();
+    auto gt_segms_lod = gt_segms->lod();
+
+    for (int i = 0; i < n; ++i) {
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      Tensor gt_classes_slice =
+          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
+      Tensor label_int32_slice =
+          label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]);
+      Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
+
+      auto sub_lod_and_offset =
+          framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0);
+      auto lod_length = sub_lod_and_offset.first;
+      size_t s = sub_lod_and_offset.second.first;
+      size_t e = sub_lod_and_offset.second.second;
+      Tensor gt_segms_slice = gt_segms->Slice(s, e);
+
+      std::vector<Tensor> tensor_output = SampleMaskForOneImage<T>(
+          dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice,
+          gt_segms_slice, rois_slice, label_int32_slice, num_classes,
+          resolution, lod_length);
+
+      Tensor sampled_mask_rois = tensor_output[0];
+      Tensor sampled_roi_has_mask_int32 = tensor_output[1];
+      Tensor sampled_mask_int32 = tensor_output[2];
+
+      AppendMask<T>(mask_rois, kBoxDim * num_mask, &sampled_mask_rois);
+      AppendMask<int>(roi_has_mask_int32, num_mask,
+                      &sampled_roi_has_mask_int32);
+      AppendMask<int>(mask_int32, mask_dim * num_mask, &sampled_mask_int32);
+
+      num_mask += sampled_mask_rois.dims()[0];
+      lod0.emplace_back(num_mask);
+    }
+
+    lod.emplace_back(lod0);
+    mask_rois->set_lod(lod);
+    roi_has_mask_int32->set_lod(lod);
+    mask_int32->set_lod(lod);
+    mask_rois->Resize({num_mask, kBoxDim});
+    roi_has_mask_int32->Resize({num_mask, 1});
+    mask_int32->Resize({num_mask, mask_dim});
+  }
+};
+
+class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("ImInfo",
+             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
+             "B is the number of input images, "
+             "each element consists of im_height, im_width, im_scale.");
+    AddInput("GtClasses",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+             "M is the number of groundtruth, "
+             "each element is a class label of groundtruth.");
+    AddInput(
+        "IsCrowd",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+        "M is the number of groundtruth, "
+        "each element is a flag indicating whether a groundtruth is crowd.");
+    AddInput(
+        "GtSegms",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], its LoD "
+        "level is 3. LoD[0] represents the number of gt objects in each "
+        "instance. LoD[1] represents the number of segmentations of each "
+        "object. LoD[2] represents the number of polygons of each "
+        "segmentation. S is the total number of polygon coordinate points. "
+        "Each element is an (x, y) coordinate point.");
+    AddInput(
+        "Rois",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. "
+        "R is the number of rois which is the output of "
+        "generate_proposal_labels, "
+        "each element is a bounding box with (xmin, ymin, xmax, ymax) format.");
+    AddInput("LabelsInt32",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [R, 1], "
+             "each element represents a class label of a roi");
+    AddOutput(
+        "MaskRois",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
+        "P is the number of masks, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddOutput("RoiHasMaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
+              "each element represents the output mask rois index with regard "
+              "to input rois");
+    AddOutput("MaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, Q], "
+              "Q equals num_classes * resolution * resolution");
+
+    AddAttr<int>("num_classes", "Class number.");
+    AddAttr<int>("resolution", "Resolution of mask.");
+
+    AddComment(R"DOC(
+Given the RoIs and corresponding labels, this operator samples foreground
+RoIs. The mask branch produces a :math:`K \\times M^{2}` dimensional output
+target for each foreground RoI, which encodes K binary masks of resolution
+M x M, one for each of the K classes. These mask targets are used to compute
+the loss of the mask branch.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp,
+                  ops::GenerateMaskLabelsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(generate_mask_labels,
+                       ops::GenerateMaskLabelsKernel<float>);
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index a652d4d95750ff89f0ef63338031e80eed6f92bb..5b2e571baf390bfa9b4bdfa6e0f151102de709fc 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -48,20 +48,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                    "Input(GtBoxes) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Rois"),
-                   "Output(Rois) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Rois"),
+        "Output(Rois) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("LabelsInt32"),
-        "Output(LabelsInt32) of RpnTargetAssignOp should not be null");
+        "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("BboxTargets"),
-        "Output(BboxTargets) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxInsideWeights"),
-        "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxOutsideWeights"),
-        "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
+        "Output(BboxTargets) of GenerateProposalLabelsOp should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"),
+                   "Output(BboxInsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"),
+                   "Output(BboxOutsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
     auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
     auto gt_boxes_dims =
ctx->GetInputDim("GtBoxes"); @@ -225,30 +226,36 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( - const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, + const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes, + const Tensor& im_info, const int batch_size_per_im, const float fg_fraction, + const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, std::minstd_rand engine, bool use_random) { - auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale = im_info->data()[2]; - rpn_rois_et = rpn_rois_et / im_scale; + auto im_scale = im_info.data()[2]; + + Tensor rpn_rois; + rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); + T* rpn_rois_dt = rpn_rois.data(); + const T* rpn_rois_in_dt = rpn_rois_in.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } Tensor boxes; - int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - Concat(context, *gt_boxes, *rpn_rois, &boxes); + Concat(context, gt_boxes, rpn_rois, &boxes); // Overlaps Tensor proposal_to_gt_overlaps; - proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; @@ -263,7 +270,7 @@ std::vector SampleRoisForOneImage( sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets @@ -397,8 +404,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, - >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, + gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine, use_random); Tensor sampled_rois = tensor_output[0]; @@ -467,7 +474,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "P usuall equal to batch_size_per_im * batch_size, " "each element is a bounding box with 
[xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", - "(LoDTensor), This output is a 2D LoDTensor with shape [P], " + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " "each element repersents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd6fee713815345152fce73e85a45aa5cd68b1da --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; } + +static inline int Compare(const void* a, const void* b) { + uint32_t c = *(reinterpret_cast(a)); + uint32_t d = *(reinterpret_cast(b)); + return c > d ? 1 : c < d ? -1 : 0; +} + +void Decode(const uint32_t* cnts, int m, uint8_t* mask) { + uint8_t v = 0; + for (int j = 0; j < m; j++) { + for (uint32_t k = 0; k < cnts[j]; k++) { + *(mask++) = v; + } + v = !v; + } +} + +typedef uint32_t uint; +void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { + int j, m = 0; + double scale = 5; + int *x, *y, *u, *v; + uint *a, *b; + platform::CPUPlace cpu; + auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); + x = reinterpret_cast(xptr->ptr()); + y = x + (k + 1); + + for (j = 0; j < k; j++) x[j] = static_cast(scale * xy[j * 2 + 0] + .5); + x[k] = x[0]; + for (j = 0; j < k; j++) y[j] = static_cast(scale * xy[j * 2 + 1] + .5); + y[k] = y[0]; + for (j = 0; j < k; j++) { + m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1; + } + auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); + u = reinterpret_cast(vptr->ptr()); + v = u + m; + m = 0; + for (j = 0; j < k; j++) { + int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d; + int flip; + double s; + dx = abs(xe - xs); + dy = abs(ys - ye); + flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); + if (flip) { + t = xs; + xs = xe; + xe = t; + t = ys; + ys = ye; + ye = t; + } + if (dx >= dy) { + s = dx == 0 ? 0 : static_cast(ye - ys) / dx; + for (d = 0; d <= dx; d++) { + t = flip ? dx - d : d; + u[m] = t + xs; + v[m] = static_cast(ys + s * t + .5); + m++; + } + } else { + s = dy == 0 ? 0 : static_cast(xe - xs) / dy; + for (d = 0; d <= dy; d++) { + t = flip ? dy - d : d; + v[m] = t + ys; + u[m] = static_cast(xs + s * t + .5); + m++; + } + } + } + /* get points along y-boundary and downsample */ + k = m; + m = 0; + double xd, yd; + auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); + x = reinterpret_cast(xyptr->ptr()); + y = x + k; + for (j = 1; j < k; j++) { + if (u[j] != u[j - 1]) { + xd = static_cast(u[j] < u[j - 1] ? 
u[j] : u[j] - 1); + xd = (xd + .5) / scale - .5; + if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; + yd = static_cast(v[j] < v[j - 1] ? v[j] : v[j - 1]); + yd = (yd + .5) / scale - .5; + if (yd < 0) + yd = 0; + else if (yd > h) + yd = h; + yd = ceil(yd); + x[m] = static_cast(xd); + y[m] = static_cast(yd); + m++; + } + } + /* compute rle encoding given y-boundary points */ + k = m; + auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); + a = reinterpret_cast(aptr->ptr()); + for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); + a[k++] = static_cast(h * w); + + qsort(a, k, sizeof(uint), Compare); + uint p = 0; + for (j = 0; j < k; j++) { + uint t = a[j]; + a[j] -= p; + p = t; + } + auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); + b = reinterpret_cast(bptr->ptr()); + j = m = 0; + b[m++] = a[j++]; + while (j < k) { + if (a[j] > 0) { + b[m++] = a[j++]; + } else { + j++; + if (j < k) b[m - 1] += a[j++]; + } + } + + // convert to mask + auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); + uint8_t* msk = reinterpret_cast(mskptr->ptr()); + Decode(b, m, msk); + + for (int ii = 0; ii < h; ++ii) { + for (int jj = 0; jj < w; ++jj) { + mask[ii * w + jj] = msk[jj * h + ii]; + } + } +} + +void Poly2Boxes(const std::vector>>& polys, + float* boxes) { + // lists + for (size_t i = 0; i < polys.size(); ++i) { + float x0 = std::numeric_limits::max(); + float x1 = std::numeric_limits::min(); + float y0 = std::numeric_limits::max(); + float y1 = std::numeric_limits::min(); + // each list may have more than one polys + for (size_t j = 0; j < polys[i].size(); ++j) { + for (size_t k = 0; k < polys[i][j].size() / 2; ++k) { + x0 = std::min(x0, polys[i][j][2 * k]); + x1 = std::max(x1, polys[i][j][2 * k]); + y0 = std::min(y0, polys[i][j][2 * k + 1]); + y1 = std::max(y1, polys[i][j][2 * k + 1]); + } + } + boxes[i * 4] = x0; + boxes[i * 4 + 1] = y0; + boxes[i * 4 + 2] = x1; + boxes[i * 4 + 3] = y1; + } +} + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask) { + float w = box[2] - box[0]; + float h = box[3] - box[1]; + w = std::max(w, static_cast(1.)); + h = std::max(h, static_cast(1.)); + + uint8_t* msk = nullptr; + if (polygons.size() == 1UL) { + msk = mask; + } else { + msk = reinterpret_cast( + malloc(M * M * polygons.size() * sizeof(uint8_t))); + } + for (size_t i = 0; i < polygons.size(); ++i) { + int k = polygons[i].size() / 2; + std::vector p; + for (int j = 0; j < k; ++j) { + float pw = (polygons[i][2 * j] - box[0]) * M / w; + float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; + p.push_back(pw); + p.push_back(ph); + } + uint8_t* msk_i = msk + i * M * M; + Poly2Mask(p.data(), k, M, M, msk_i); + } + + if (polygons.size() > 1UL) { + for (size_t i = 0; i < polygons.size(); ++i) { + uint8_t* msk_i = msk + i * M * M; + for (int j = 0; j < M * M; ++j) { + if (i == 0) { + mask[j] = msk_i[j]; + } else { + mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; + } + } + } + free(msk); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4e0ea54f6d89ff273382afc1e9a151cfd9773cc6 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
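Polys2MaskWrtBox above rasterizes each polygon separately and then folds the per-polygon masks into a single binary mask; the merge is a plain logical OR. A standalone sketch of that merge, with illustrative names:

#include <cstdint>
#include <cstdio>
#include <vector>

// OR-combine per-polygon masks, mirroring the (mask[j] + msk_i[j]) > 0 merge.
void UnionMasks(const std::vector<std::vector<uint8_t>> &parts,
                std::vector<uint8_t> *out) {
  out->assign(parts.front().size(), 0);  // assumes at least one part
  for (const std::vector<uint8_t> &part : parts)
    for (std::size_t j = 0; j < part.size(); ++j)
      (*out)[j] = ((*out)[j] + part[j]) > 0 ? 1 : 0;
}

int main() {
  std::vector<std::vector<uint8_t>> parts = {{0, 1, 0}, {1, 0, 0}};
  std::vector<uint8_t> mask;
  UnionMasks(parts, &mask);
  std::printf("%d %d %d\n", mask[0], mask[1], mask[2]);  // prints: 1 1 0
  return 0;
}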
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask); + +void Poly2Boxes(const std::vector>>& polys, + float* boxes); + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..de904e947463977229545897b723b98b4d0708d6 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +template +void Compare(const T* a, const T* b, const int n) { + for (int i = 0; i < n; i++) { + EXPECT_EQ(a[i], b[i]); + } +} + +TEST(MaskUtil, Poly2MaskTest) { + float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, + 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}; + int h = 8, w = 8; + int k = 5; // length(polys) / 2 + // clang-format off + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + // the groud-truth mask is computed by coco API: + // + // import pycocotools.mask as mask_util + // import numpy as np + // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88] + // rles = mask_util.frPyObjects([segm], im_h, im_w) + // mask = mask_util.decode(rles) + // print mask + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocation->ptr()); + Poly2Mask(polys, k, h, w, mask); + Compare(expect_mask, mask, h * w); +} + +TEST(MaskUtil, Poly2BoxesTest) { + // clang-format off + std::vector>> polys = { + {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}}, + {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}} + }; + float expect_boxes[] = { + 1.69f, 1.88f, 5.94f, 6.53f, + 1.69f, 0.88f, 6.94f, 6.63f + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 8); +} + +TEST(MaskUtil, 
Polys2MaskWrtBoxTest) { + // clang-format off + std::vector>> polys = {{ + {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, + {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; + float expect_boxes[] = { + 1.69f, 0.88f, 6.94f, 6.63f + }; + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 4); + + auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocat_mask->ptr()); + int M = 8; + Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); + Compare(expect_mask, mask, M * M); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index cb492f999532fff3562050135c3a1abdcda06ad5..fc28fe818dc0bd2a8607118c015b6b5fd168fb43 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503783b32720eb57bd303ad7eb4bc17a8..b8e63f42e2040730ac79c57651d86d9e3176fa01 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr 
BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr 
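All of these client calls, brpc and grpc alike, share one shape: start the RPC on a background thread, return a VarHandlePtr immediately, and let the caller block on handle->Wait() only when it needs the result (as recv_op does further down). A language-level sketch of that pattern with std::async follows; FakeVarHandle and AsyncGetLike are invented stand-ins, not the real framework::AsyncIO/VarHandle machinery.

```cpp
#include <cassert>
#include <future>
#include <memory>
#include <string>

// Minimal stand-in for VarHandle: the caller receives it immediately and
// blocks on Wait() only when the result is needed. Illustrative only.
class FakeVarHandle {
 public:
  explicit FakeVarHandle(std::future<bool> f) : done_(std::move(f)) {}
  bool Wait() { return done_.get(); }

 private:
  std::future<bool> done_;
};
using FakeVarHandlePtr = std::shared_ptr<FakeVarHandle>;

FakeVarHandlePtr AsyncGetLike(const std::string& ep, const std::string& var) {
  // The real client enqueues the work with framework::AsyncIO and completes
  // the handle from the RPC response callback; std::async mimics that here.
  auto fut = std::async(std::launch::async, [ep, var] {
    return !ep.empty() && !var.empty();  // pretend the RPC succeeded
  });
  return std::make_shared<FakeVarHandle>(std::move(fut));
}

int main() {
  auto h = AsyncGetLike("127.0.0.1:6174", "param_0");
  assert(h->Wait());  // same blocking point as rets[i]->Wait() in recv_op
  return 0;
}
```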
BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5621f2c201b76690421a943db44535e..501a593b11d35c160348e42ee47216a85647aac4 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7b272c35b78818aa9e26feeb5497779..fea9b09414638b607ca7f7d558ce14a2d5bfa03d 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + 
getnobarrier_threads_.reset(new paddle::framework::ThreadPool(
+        rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier)));
+  }
+
   it = rpc_call_map.find(distributed::kRequestPrefetch);
   if (it != rpc_call_map.end()) {
     request_prefetch_h_ = it->second;
@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService {
         [=] { _GetVariable(cntl_butil, request, response, done); });
   }
 
+  void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
+                            const VariableMessage* request,
+                            VariableMessage* response,
+                            google::protobuf::Closure* done) override {
+    getnobarrier_threads_->Run(
+        [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); });
+  }
+
   void _GetVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VariableMessage* response,
                     google::protobuf::Closure* done) {
@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService {
     brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
     std::string varname = request->varname();
+    std::string out_varname = request->out_varname();
     VLOG(3) << "RequestGet varname:" << varname
+            << ", out_varname:" << out_varname
             << ", trainer_id:" << request->trainer_id()
             << ", from:" << cntl->remote_side();
 
     auto scope = request_get_h_->scope();
-    auto invar = scope->FindVar(varname);
+    paddle::framework::Variable* invar = nullptr;
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id,
+                           out_varname);
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(out_varname, outvar,
+                                    *request_get_h_->dev_ctx(), response,
+                                    &cntl->response_attachment(), "", false);
+    }
+  }
+
+  void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
+                             const VariableMessage* request,
+                             VariableMessage* response,
+                             google::protobuf::Closure* done) {
+    PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr,
+                   "RequestGetNoBarrier handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    std::string out_varname = request->out_varname();
     int trainer_id = request->trainer_id();
+
+    VLOG(3) << "RequestGetNoBarrier varname:" << varname
+            << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id
+            << ", from:" << cntl->remote_side();
+
+    auto scope = request_getnobarrier_h_->scope();
+    paddle::framework::Variable* invar = nullptr;
     paddle::framework::Variable* outvar = nullptr;
 
-    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);
+    request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id,
+                                    out_varname);
 
     if (outvar) {
-      distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(),
-                                    response, &cntl->response_attachment(), "",
-                                    false);
+      distributed::SerializeToIOBuf(
+          out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response,
+          &cntl->response_attachment(), "", false);
     }
   }
+
   void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService {
  private:
   distributed::RequestHandler* request_send_h_{nullptr};
   distributed::RequestHandler* request_get_h_{nullptr};
+  distributed::RequestHandler* request_getnobarrier_h_{nullptr};
   distributed::RequestHandler* request_prefetch_h_{nullptr};
   distributed::RequestHandler* request_checkpoint_h_{nullptr};
   distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
@@ -289,9
+341,10 @@ class BRPCServiceImpl : public SendRecvService { distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support process one rpc use one threadpool. std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 5009058422b81d3187f7792bf7bf56db1d03f4d6..90f2f9fd65bf1b8c1edda6a2ebe0ce5288ddcb5d 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" @@ -57,7 +58,7 @@ std::unique_ptr GenerateVars(platform::Place place) { auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({20000, 1024})); + tensor->Resize(framework::make_ddim({3, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -80,6 +81,20 @@ void Gather(const std::vector& vars, std::vector dst; client->Gather(vars, &dst, *dev_ctx, scope); std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); + dev_ctx->Wait(); + + ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); + ASSERT_EQ(dst[0]->height(), 20000); + ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); + for (int i = 0; i < 3; i++) { + ASSERT_EQ(dst[0]->rows()[i], i); + } + + std::vector vec; + TensorToVector(dst[0]->value(), *dev_ctx, &vec); + for (size_t i = 0; i < 3 * 1024; i++) { + ASSERT_FLOAT_EQ(vec[i], 32.7); + } } TEST(CollectiveServer, GPU) { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 7875c16c3cf412ee06fa7c8eb36400b1096f156b..52310f8d04db6a5df9967c0a5ec9a5e95a24cdab 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); SendProcessor* s = new SendProcessor(ch); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { - VLOG(100) << "ProcGetResponse"; + VLOG(4) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, 
"/sendrecv.SendRecvService/GetVariable", time_out); } +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( + ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); +} + VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, "/sendrecv.SendRecvService/GetMonomerVariable", time_out); } -VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& rpc_path, - int64_t time_out) { +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "GetRPC"; - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO( + [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; @@ -202,7 +217,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "PrefetchRPC"; + const std::string method 
= kPrefetchRPC; VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "BatchBarrierRPC"; + const std::string method = kBatchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendMonomerFetchBarrierRPC"; + const std::string method = kSendMonomerFetchBarrierRPC; VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); @@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendCompleteRPC"; + const std::string method = kSendCompleteRPC; VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - const std::string method = "CheckPointNotifyRPC"; + const std::string method = kCheckPointNotifyRPC; VarHandlePtr h( new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index fa77d21257647b23b8ac9f8161a216d36d7df773..ce0d2152aa27c62b6e12881aaf2ae458597e67e6 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -186,8 +186,15 @@ class GRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, @@ -228,11 +235,11 @@ class GRPCClient : public RPCClient { void Proceed(); std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, const std::string& rpc, - int64_t time_out); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& 
out_varname, + const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279e34da0c0ac89afd3f660fa089599fe..4a9c158cb0ab7f2d6fecbba9f957ae6ef153074c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -136,17 +136,65 @@ class RequestGet final : public RequestBase { void Process() override { // proc request. std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); int trainer_id = request_.trainer_id(); - VLOG(4) << "RequestGet " << varname; + + VLOG(4) << "RequestGet " << out_varname << " from " << varname; auto scope = request_handler_->scope(); - auto invar = scope->FindVar(varname); + framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetNoBarrier final : public RequestBase { + public: + explicit RequestGetNoBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetNoBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. 
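Both RequestGet::Process above and the RequestGetNoBarrier::Process body that continues below follow one contract: the variable is looked up in the scope under the request's varname, but the reply is serialized under out_varname, so the caller can receive it under a different local name. A toy sketch of that contract follows; a std::map stands in for framework::Scope, and HandleGet/Reply are invented names for illustration.

```cpp
#include <cassert>
#include <map>
#include <string>

// Toy stand-ins: a std::map plays the role of framework::Scope and the
// reply is just a (name, value) pair. Illustrative only.
struct Reply {
  std::string name;
  float value;
};

Reply HandleGet(const std::map<std::string, float>& scope,
                const std::string& varname, const std::string& out_varname) {
  // Look up under the request's varname...
  float v = scope.at(varname);
  // ...but answer under out_varname, mirroring
  // SerializeToByteBuffer(out_varname, outvar, ...) in RequestGet::Process.
  return Reply{out_varname, v};
}

int main() {
  std::map<std::string, float> scope = {{"moment_1", 0.9f}};
  Reply r = HandleGet(scope, "moment_1", "moment_1@127.0.0.1:1001");
  assert(r.name == "moment_1@127.0.0.1:1001" && r.value == 0.9f);
  return 0;
}
```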
+ std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } Finish(reply_, &responder_); @@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGetMonomerVariable) { b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, this); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 0b5c5151e637f0d7aeafaefefb01006ffe0f05c8..2965fe4490bedd0253682f0aef44e096232fc2fc 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -81,6 +81,7 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetVariableNoBarrier, kGetMonomerVariable, kGetMonomerBarrier, }; @@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; case GrpcMethod::kGetMonomerVariable: return "/sendrecv.SendRecvService/GetMonomerVariable"; case GrpcMethod::kGetMonomerBarrier: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b41efead24c8bdbe08c9b44e160445a..991158ac72007efc1233f852caed4f90f35fe1cd 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE 
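The named constants above replace ad-hoc string literals, and WITHOUT_BARRIER_MESSAGE deserves a closer look: AsyncGetVarNoBarrier appends it to the variable name on the client, and RequestGetNoBarrierHandler strips it off again on the server before the scope lookup (the real code uses string::Sprintf and string::Piece/TrimSuffix). A standalone sketch of that round trip with plain std::string follows; the helper names are invented.

```cpp
#include <cassert>
#include <string>

const std::string kWithoutBarrier = "@WITHOUT_BARRIER@RECV";

// Client side: mangle the name, as AsyncGetVarNoBarrier does with
// string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE).
std::string AddNoBarrierSuffix(const std::string& var_name) {
  return var_name + kWithoutBarrier;
}

// Server side: detect and strip the suffix before the scope lookup,
// mirroring string::Contains + string::TrimSuffix in
// RequestGetNoBarrierHandler::Handle. Illustrative only.
bool TrimNoBarrierSuffix(std::string* name) {
  if (name->size() < kWithoutBarrier.size()) return false;
  size_t pos = name->size() - kWithoutBarrier.size();
  if (name->compare(pos, kWithoutBarrier.size(), kWithoutBarrier) != 0)
    return false;
  name->erase(pos);
  return true;
}

int main() {
  std::string wire_name = AddNoBarrierSuffix("param_0");
  assert(wire_name == "param_0@WITHOUT_BARRIER@RECV");
  assert(TrimNoBarrierSuffix(&wire_name) && wire_name == "param_0");
  return 0;
}
```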
"LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e91d2dfbe929dcc11645a40c44afb4e..913ae76b38dc663d6fb4102f795ac713fd8a6bdf 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -81,7 +82,8 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(4) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -112,6 +114,32 @@ bool RequestGetHandler::Handle(const std::string& varname, return true; } +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 5e0b25c5c2ce161dee0948a07baab32dfff9be6f..f3c1b24526b8b28033c0c979f74d44a3d7a94201 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler { bool enable_dc_asgd_; }; +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + static inline void BuildVar(const std::string& param_name, std::initializer_list arguments, paddle::framework::proto::OpDesc::Var* var) { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index b668d869787a47ebd36f570061421ddbeae5a09a..ea54e0c2951253fc009672f4cd2e5233ed56944e 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -43,8 +43,15 @@ class RPCClient { const platform::DeviceContext& 
ctx,
                                   const framework::Scope& scope,
                                   const std::string& var_name,
+                                  const std::string& out_varname,
                                   int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetVarNoBarrier(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      const std::string& out_varname,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncGetMonomerVariable(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& var_name,
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index b39eef04d8d1de77cb951f90a10e69eebb495282..6303667884361be050ac62c604274c87caa72444 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -17,8 +17,14 @@ package sendrecv;
 option cc_generic_services = @cc_generic_services@;
 
 service SendRecvService {
+  // For round-robin-like parameter-server hashing, do not split tensors;
+  // send and recv exactly one tensor per call.
+  // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
@@ -27,12 +33,17 @@ service SendRecvService {
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
 
+// It can be: LoDTensor, SelectedRows or NCCL_ID
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
   NCCL_ID = 2;
 }
 
+// VariableMessage is a serialized paddle variable message.
+// NOTICE(gongwb): don't modify this proto if you are not
+// familiar with how we serialize in sendrecvop_utils.h
+// and deserialize it in variable_response.h.
 message VariableMessage {
   enum Type {
     // Pod Types
@@ -49,14 +60,21 @@ message VariableMessage {
   string varname = 1;
   // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
   VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
   Type data_type = 3;
   repeated int64 dims = 4;
 
+  // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
+  // selected_rows height, aka. the original dim0
   int64 slr_height = 7;
+  // tensor data
   bytes serialized = 8;
+  // selected_rows data
   bytes rows = 9;
+  // Look up table block execution output variable name.
string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 629f364d712694d2bc1d2483711f5247561ed1da..53968831ea0d640d13fc69ce1855257e8deed54c 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -347,6 +347,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, new distributed::RequestPrefetchHandler(sync_mode)); request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( sync_mode, checkpoint_block_id)); + request_get_no_barrier_handler_.reset( + new distributed::RequestGetNoBarrierHandler()); rpc_service_->RegisterRPC(distributed::kRequestSend, request_send_handler_.get(), @@ -359,6 +361,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, FLAGS_rpc_prefetch_thread_num); rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, request_checkpoint_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier, + request_get_no_barrier_handler_.get()); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -413,6 +417,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, f(request_get_handler_.get()); f(request_prefetch_handler_.get()); f(request_checkpoint_handler_.get()); + f(request_get_no_barrier_handler_.get()); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index 9431978df836121baacc12ed4e1ee6b218cc7d7a..f20442bad7c5bd96173b9d6efc4dceb13feacf5b 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - virtual ~ListenAndServOp(); void RunSyncLoop(framework::Executor* executor, @@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase { mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_get_no_barrier_handler_; mutable std::shared_ptr request_prefetch_handler_; mutable std::shared_ptr diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c57590191d58a12760fb335df76037685d1ced..05c00251b97bb5071102a43208c1fbbfa4ef8d2d 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < 
row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 48065437e38b2c5457c135cce03075f81110a329..120c65f29699bf2745b09ea312d1de069c8173c5 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -27,30 +27,50 @@ namespace operators { class RecvOp : public framework::OperatorBase { public: - RecvOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto outs = Outputs("Out"); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { std::vector epmap = Attr>("epmap"); + std::vector varnames = + Attr>("varnames"); int sync_mode = Attr("sync_mode"); + auto outs = Outputs("Out"); + bool with_barrier = Attr("with_barrier"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); - distributed::RPCClient* rpc_client = + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( Attr("trainer_id")); - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); - } - if (sync_mode) { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? 
outs[i] : varnames[i];
+        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                << varname << " and with AsyncGetVarNoBarrier";
+        rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
+                                                        varname, outs[i]));
+      }
       for (size_t i = 0; i < rets.size(); i++) {
         PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
       }
@@ -79,12 +99,23 @@ This operator can get variables from server side.
         "(int, default 0)"
         "sync recv or async recv.")
         .SetDefault(0);
+    AddAttr<bool>("with_barrier",
+                  "(bool, default True) if with_barrier=False, will use "
+                  "AsyncGetVarNoBarrier to get variables from the pserver "
+                  "immediately")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>(
+        "varnames",
+        "(string vector, default {}) "
+        "sometimes we need to store a received variable under another name, "
+        "for example: we need a var named 'moment_1@127.0.0.1:1001', "
+        "and its real name on the parameter server is 'moment_1'. ")
+        .SetDefault({});
   }
 };
 
 class RecvOpShapeInference : public framework::InferShapeBase {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
+  void operator()(framework::InferShapeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 7bb6934e1496cc989eee8ba82f56959522803bfb..cb8a4e7e1502e7e6ceb48e51452c2c7ab8313972 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -277,68 +277,6 @@ class TransformFunctor {
   Functor func_;
 };
 
-#define EIGEN_FUNCTOR(name, eigen_op)                                         \
-  struct Eigen##name##Functor {                                               \
-    template <typename DeviceContext, typename T>                             \
-    inline void Run(const framework::Tensor *x, const framework::Tensor *y,   \
-                    framework::Tensor *z,                                     \
-                    const framework::ExecutionContext &ctx) {                 \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                      \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                      \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                      \
-      z_e.device(                                                             \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =     \
-          eigen_op(x_e, y_e);                                                 \
-    }                                                                         \
-    template <typename DeviceContext, typename T>                             \
-    inline void RunBroadCast(const framework::Tensor *x,                      \
-                             const framework::Tensor *y, framework::Tensor *z, \
-                             const framework::ExecutionContext &ctx, int pre, \
-                             int n) {                                         \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                      \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                      \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                      \
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                 \
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))            \
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));         \
-      z_e.device(                                                             \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =     \
-          eigen_op(x_e, y_bcast);                                             \
-    }                                                                         \
-    template <typename DeviceContext, typename T>                             \
-    inline void RunBroadCast2(const framework::Tensor *x,                     \
-                              const framework::Tensor *y,                     \
-                              framework::Tensor *z,                           \
-                              const framework::ExecutionContext &ctx, int pre, \
-                              int n, int post) {                              \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                      \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                      \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                      \
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))              \
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))      \
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));         \
-      z_e.device(                                                             \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =     \
-          eigen_op(x_e, y_bcast);                                             \
-    }                                                                         \
-  }
-
-#define EIGEN_ADD(x, y) ((x) + (y))
-
-EIGEN_FUNCTOR(Add, EIGEN_ADD);
-
-#define EIGEN_SUB(x, y) ((x) - (y))
-
-EIGEN_FUNCTOR(Sub, EIGEN_SUB);
-
-#define EIGEN_MUL(x, y) ((x) * (y))
-
-EIGEN_FUNCTOR(Mul, EIGEN_MUL);
-
-#define EIGEN_DIV(x, y) ((x) / (y))
-
-EIGEN_FUNCTOR(Div,
EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 0a8c0814a7d472bb1b527a4df470a34dcaf00e81..55cef93aacd43174edefbb8aa740bcbea3d8feef 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -103,8 +103,10 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dc27e543f0dfd65e556f9e3a138778972ad6982f..6bbb7155dda9b2c844f793a63adb861c2ed956e8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,6 +54,7 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) +math_library(beam_search DEPS math_function) math_library(matrix_bit_code) @@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) +cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc new file mode 100644 index 0000000000000000000000000000000000000000..fb7119273a734feba870fdabade6a4faa1d5e9a3 --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
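Before the body of the new CPU beam-search file (its license header closes just below), a quick orientation: each step keeps, per source sequence, the beam_size best (id, score) candidates, and when scores are not pre-accumulated a candidate's running score is pre_score + log(prob). A simplified standalone sketch of that selection rule follows, using invented names (Cand, InsertTopK) rather than the Item/Insert/SelectTopBeamSizeItems routines in the file itself.

```cpp
#include <cassert>
#include <cmath>
#include <utility>
#include <vector>

struct Cand {
  int id;
  float score;
};

// Keep the top `beam_size` candidates in descending score order, using the
// same sorted-insertion idea as BeamSearchFunctor::Insert below.
void InsertTopK(std::vector<Cand>* beam, Cand c, size_t beam_size) {
  if (beam->size() < beam_size) {
    beam->push_back(c);
  } else if (c.score <= beam->back().score) {
    return;  // not better than the current worst candidate
  } else {
    beam->back() = c;
  }
  // bubble the new candidate into its sorted position
  for (size_t k = beam->size() - 1;
       k > 0 && (*beam)[k - 1].score < (*beam)[k].score; --k) {
    std::swap((*beam)[k - 1], (*beam)[k]);
  }
}

int main() {
  // Candidate step probabilities for one source; is_accumulated == false,
  // so the running score is pre_score + log(prob), as in
  // SelectTopBeamSizeItems.
  float pre_score = -0.5f;
  float probs[] = {0.1f, 0.6f, 0.3f};
  std::vector<Cand> beam;
  for (int id = 0; id < 3; ++id) {
    InsertTopK(&beam, {id, pre_score + std::log(probs[id])}, 2);
  }
  assert(beam.size() == 2 && beam[0].id == 1 && beam[1].id == 2);
  return 0;
}
```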
*/
+
+#include "paddle/fluid/operators/math/beam_search.h"
+#include <cmath>
+#include <numeric>
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class BeamSearchFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext &context,
+                  const framework::LoDTensor *pre_ids,
+                  const framework::LoDTensor *pre_scores,
+                  const framework::LoDTensor *ids,
+                  const framework::LoDTensor *scores,
+                  framework::LoDTensor *selected_ids,
+                  framework::LoDTensor *selected_scores, size_t level,
+                  size_t beam_size, int end_id, bool is_accumulated) {
+    auto abs_lod = framework::ToAbsOffset(scores->lod());
+    auto &high_level = abs_lod[level];
+
+    auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores,
+                                        level, beam_size, end_id,
+                                        is_accumulated);
+    auto selected_items = ToMap(items, high_level.back());
+    if (FLAGS_v == 3) {
+      VLOG(3) << "selected_items:";
+      for (size_t i = 0; i < selected_items.size(); ++i) {
+        VLOG(3) << "offset: " << i;
+        for (auto &item : selected_items[i]) {
+          VLOG(3) << item.ToString();
+        }
+      }
+    }
+
+    PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
+    // calculate the output tensor's height
+    size_t num_instances = std::accumulate(
+        std::begin(selected_items), std::end(selected_items), 0,
+        [](size_t a, std::vector<Item> &b) { return a + b.size(); });
+    // the output tensor shape should be [num_instances, 1]
+    auto dims = framework::make_ddim(
+        std::vector<int64_t>({static_cast<int>(num_instances), 1}));
+    selected_ids->Resize(dims);
+    selected_scores->Resize(dims);
+
+    auto *selected_ids_data =
+        selected_ids->mutable_data<int64_t>(platform::CPUPlace());
+    auto *selected_scores_data =
+        selected_scores->mutable_data<float>(platform::CPUPlace());
+
+    // fill in data
+    std::vector<size_t> low_level;
+    size_t low_offset = 0;
+    for (auto &items : selected_items) {
+      low_level.push_back(low_offset);
+      for (auto &item : items) {
+        selected_ids_data[low_offset] = item.id;
+        selected_scores_data[low_offset] = item.score;
+        low_offset++;
+      }
+    }
+    low_level.push_back(low_offset);
+
+    // fill lod
+    framework::LoD lod(2);
+    lod[0].assign(high_level.begin(), high_level.end());
+    lod[1].assign(low_level.begin(), low_level.end());
+    if (!framework::CheckLoD(lod)) {
+      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+    }
+    selected_ids->set_lod(lod);
+    selected_scores->set_lod(lod);
+  }
+
+  /*
+   * The basic items used for sorting.
+   */
+  struct Item {
+    Item() {}
+    Item(size_t offset, size_t id, float score)
+        : offset(offset), id(id), score(score) {}
+    // offset in the higher lod level.
+    size_t offset;
+    // prefix id in the lower lod level.
+    // size_t prefix;
+    // the candidate id
+    size_t id;
+    // the corresponding score
+    float score;
+
+    inline bool operator<(const Item &in) const {
+      return (score < in.score) ||
+             ((score == in.score) && (offset < in.offset));
+    }
+
+    inline void operator=(const Item &in) {
+      offset = in.offset;
+      id = in.id;
+      score = in.score;
+    }
+
+    std::string ToString() {
+      std::ostringstream os;
+      os << "{";
+      os << "offset: " << offset << ", ";
+      os << "id: " << id << ", ";
+      os << "score: " << score;
+      os << "}";
+      return os.str();
+    }
+  };
+
+ protected:
+  /*
+   * Prune the source sentences whose branches are all finished; this is
+   * optional. Pruning must happen one step later than finishing (hence
+   * pre_ids is needed here), since the end tokens must be written out.
+   */
+  void PruneEndBeams(const framework::LoDTensor *pre_ids,
+                     const framework::LoD &abs_lod,
+                     std::vector<std::vector<Item>> *items, size_t lod_level,
+                     int end_id) {
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto &high_level = abs_lod[lod_level];
+    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
+      size_t src_prefix_start = high_level[src_idx];
+      size_t src_prefix_end = high_level[src_idx + 1];
+      bool finish_flag = true;
+      for (size_t offset = src_prefix_start; offset < src_prefix_end;
+           offset++) {
+        for (auto &item : items->at(offset)) {
+          if (item.id != static_cast<size_t>(end_id) ||
+              pre_ids_data[offset] != end_id) {
+            finish_flag = false;
+            break;
+          }
+        }
+        if (!finish_flag) break;
+      }
+      if (finish_flag) {  // all branches of the beam (source sentence) end;
+                          // prune this beam
+        for (size_t offset = src_prefix_start; offset < src_prefix_end;
+             offset++)
+          items->at(offset).clear();
+      }
+    }
+  }
+
+  /*
+   * Transform the items into a map whose key is offset, value is the items.
+   * NOTE low performance.
+   */
+  std::vector<std::vector<Item>> ToMap(
+      const std::vector<std::vector<Item>> &items, size_t element_num) {
+    std::vector<std::vector<Item>> result;
+    result.resize(element_num);
+    for (auto &entries : items) {
+      for (const auto &item : entries) {
+        result[item.offset].push_back(item);
+      }
+    }
+    return result;
+  }
+
+  void Insert(std::vector<Item> *top_beam_ptr, const Item &item,
+              size_t beam_size) {
+    std::vector<Item> &top_beam = *top_beam_ptr;
+
+    size_t num_beams = top_beam.size();
+    if (num_beams < beam_size) {
+      top_beam.resize(num_beams + 1);
+      num_beams++;
+    } else {
+      if (item < top_beam[beam_size - 1]) {
+        return;
+      }
+    }
+
+    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
+      if (top_beam[k] < item) {
+        top_beam[k + 1] = top_beam[k];
+      } else {
+        top_beam[k + 1] = item;
+        return;
+      }
+    }
+    top_beam[0] = item;
+  }
+
+  /*
+   * For each source, select the top beam_size records.
+   */
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
+      const framework::LoDTensor *pre_ids,
+      const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids,
+      const framework::LoDTensor *scores, size_t lod_level, size_t beam_size,
+      int end_id, bool is_accumulated) {
+    std::vector<std::vector<Item>> result;
+
+    // find the current candidates
+    auto abs_lod = framework::ToAbsOffset(scores->lod());
+
+    auto *pre_ids_data = pre_ids->data<int64_t>();
+    auto *pre_scores_data = pre_scores->data<float>();
+
+    auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
+    auto *scores_data = scores->data<float>();
+
+    size_t num_seqs = scores->NumElements(lod_level);
+    size_t seq_width = 1;
+    for (int i = 1; i < scores->dims().size(); i++) {
+      seq_width *= scores->dims()[i];
+    }
+
+    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
+      size_t seq_offset_start = abs_lod[lod_level][seq_id];
+      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
+
+      std::vector<Item> top_beam;
+      top_beam.reserve(beam_size);
+
+      for (size_t offset = seq_offset_start; offset < seq_offset_end;
+           ++offset) {
+        auto pre_id = pre_ids_data[offset];
+        auto pre_score = pre_scores_data[offset];
+        if (pre_id == end_id) {
+          // Allocate all probability mass to end_id for finished branches;
+          // the other candidate ids can be ignored.
+          Item item(offset, end_id, pre_score);
+          Insert(&top_beam, item, beam_size);
+        } else {
+          size_t index = offset * seq_width;
+          for (size_t d = 0; d < seq_width; d++, index++) {
+            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
+            float score = is_accumulated
+                              ? scores_data[index]
+                              : pre_score + std::log(scores_data[index]);
+            Item item(offset, id, score);
+            Insert(&top_beam, item, beam_size);
+          }
+        }
+      }
+
+      result.emplace_back(top_beam);
+    }
+
+    if (FLAGS_v == 3) {
+      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+      for (auto &items : result) {
+        VLOG(3) << "item set:";
+        for (auto &item : items) {
+          VLOG(3) << item.ToString();
+        }
+      }
+    }
+
+    return result;
+  }
+};
+
+template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
+template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d94e3023ce537cb9fa456e079c4fa3cf57fb954d
--- /dev/null
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -0,0 +1,393 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/beam_search.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+struct Triple {
+  __device__ __forceinline__ Triple() {}
+  __device__ __forceinline__ Triple(int o, int i, float s)
+      : offset(o), id(i), score(s) {}
+
+  __device__ __forceinline__ void set(int o, int i, float s) {
+    offset = o;
+    id = i;
+    score = s;
+  }
+
+  __device__ __forceinline__ void operator=(const Triple& in) {
+    offset = in.offset;
+    id = in.id;
+    score = in.score;
+  }
+
+  __device__ __forceinline__ bool operator<(const float s) const {
+    return score < s;
+  }
+
+  __device__ __forceinline__ bool operator<(const Triple& in) const {
+    return (score < in.score) || ((score == in.score) && (offset < in.offset));
+  }
+
+  int offset;
+  int id;
+  float score;
+};
+
+__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p,
+                                       int beam_size) {
+  if (p < top_beam[beam_size - 1]) {
+    return;
+  }
+  for (int k = beam_size - 2; k >= 0; --k) {
+    if (top_beam[k] < p) {
+      top_beam[k + 1] = top_beam[k];
+    } else {
+      top_beam[k + 1] = p;
+      return;
+    }
+  }
+  top_beam[0] = p;
+}
+
+template <int MaxThreadsPerSeq, bool IsAccumulated = true>
+__device__ __forceinline__ int SelectTopBeam(
+    Triple* top_beam, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_offset_start,
+    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
+    int used_threads) {
+  // top_beam is shared memory
+  const int tid = threadIdx.x;
+  const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq;
+
+  int num_used_threads = used_threads;
+
+  Triple* top_beam_local = top_beam + tid * beam_size;
+  if (tid_of_seq < num_used_threads) {
+    for (int i = 0; i < beam_size; ++i) {
+      top_beam_local[i].set(-1, -1, -INFINITY);
+    }
+
+    for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) {
+      int pre_id = static_cast<int>(pre_ids[offset]);
+      if (pre_id == end_id) {
+        if (tid_of_seq == 0) {
+          Triple tmp(offset,
end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + Triple* top_beam_local, const int seq_offset_start, + const int seq_offset_end, const int selected_seq_start, + const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_offset_start, const int seq_offset_end, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each 
sequence. + Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, + seq_offset_start, seq_offset_end, selected_seq_start, + selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_length, const int seq_width, + int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? 
ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. + auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000000000000000000000000000000000..3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
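The CPU and GPU paths above share the same fixed-size insertion idea: each candidate is shifted into a score-sorted top beam only if it beats the current worst entry. Below is a minimal host-side sketch of that insert in plain C++ (hypothetical standalone names, and simplified in that it ignores the kernel's offset tie-break for equal scores):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Host-side mirror of the kernel's fixed-size beam insert: top_beam stays
// sorted in descending score order; a new candidate is shifted in only if
// it beats the current worst entry.
struct Candidate {
  int offset;
  int id;
  float score;
};

void InsertIntoBeam(std::vector<Candidate>* top_beam, const Candidate& c) {
  const int beam_size = static_cast<int>(top_beam->size());
  if (c.score < (*top_beam)[beam_size - 1].score) return;  // below the beam
  int k = beam_size - 2;
  for (; k >= 0 && (*top_beam)[k].score < c.score; --k) {
    (*top_beam)[k + 1] = (*top_beam)[k];  // shift weaker entries down
  }
  (*top_beam)[k + 1] = c;
}

int main() {
  // Beam of size 2, initialised to -inf like the kernel's top_beam_local.
  std::vector<Candidate> beam(2, Candidate{-1, -1, -INFINITY});
  for (const Candidate& c : {Candidate{0, 4, 0.6f}, Candidate{0, 2, 0.3f},
                             Candidate{0, 5, 0.5f}}) {
    InsertIntoBeam(&beam, c);
  }
  for (const Candidate& c : beam) {
    std::printf("id=%d score=%.2f\n", c.id, c.score);  // 4/0.60 then 5/0.50
  }
  return 0;
}
```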
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * This is an implementation of beam search. + * + * To explain the details, lets take machine translation task for example, in + * this task, one source sentence is translated to multiple target sentences, + * during this period, one sentence will be translated to multiple translation + * prefixes(target sentence that have not ended), in each time step a prefix + * will have some candidates, input the candidate ids and their corresponding + * scores (probabilities), it will sort and select the top beam_size candidates + * for each source sentence, and store the selected candidates's score and their + * corresponding ids to LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * - LoD (should have 2 levels) + * - first level: [0, 1, 4] + * - second level: [0, 1, 2, 3, 4] + * - tensor's data: + * [[4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1]] + * + * scores: + * - LoD same as `ids` + * - tensor's data + * [[0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1]] + * + * The inputs means that there are 2 source sentences to translate, and the + * first source has 1 prefix, the second source has 2 prefix. + * + * Lets assume beam size is 2, and the beam search's output should be + * - LoD + * - first level: [0, 1, 2] + * - second level: [0, 2, 4] + * - id tensor's data + * [[4, + * 1, + * 3, + * 8]] + * - score tensor's data + * [[0.5, + * 0.3, + * 0.9, + * 0.7]] + * + * TODO all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +template +class BeamSearchFunctor { + public: + /* + * The main function of beam search. + * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a varience-length sequence. + * The format might be described with a two-level LoD + * - [[0 1], + * [0 1 2]] + * - [[] + * [0 1]] + * the first level of LoD tells that there are two source sentences. The + * second level describes the details of the candidate id set's offsets in + * the source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD with + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensor is empty, in machine translation task + * that means no candidates is provided, and the task will stop running. 
+ */ + void operator()(const DeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c29ee95f6b109209316e4e8c8f3cda37eac62ae --- /dev/null +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + TensorCopySync(cpu_pre_ids, 
*place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, &pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8f1d2cab19c74b57fdf9f903d9dc65d..238d9f2905058d267ffbee0669594920d7a9e031 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
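For the test data above, with is_accumulated set to true and no candidate hitting end_id, selection reduces to a per-source top-beam_size over all (row, id, score) candidates. The following is a hypothetical standalone reference, ignoring the end_id branch, that reproduces the test's expected ids {4, 5} for the first source and {3, 8} for the second:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

// Rank every candidate of each source sentence (rows given by the level-0
// LoD offsets) and keep the best beam_size of them.
int main() {
  const std::vector<int> ids = {4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1};
  const std::vector<float> scores = {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f,
                                     0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f};
  const std::vector<int> source_offsets = {0, 2, 4};  // level-0 LoD
  const int seq_width = 3, beam_size = 2;

  for (size_t s = 0; s + 1 < source_offsets.size(); ++s) {
    std::vector<std::pair<float, int>> cands;  // (score, id)
    for (int row = source_offsets[s]; row < source_offsets[s + 1]; ++row) {
      for (int j = 0; j < seq_width; ++j) {
        cands.emplace_back(scores[row * seq_width + j],
                           ids[row * seq_width + j]);
      }
    }
    std::partial_sort(cands.begin(), cands.begin() + beam_size, cands.end(),
                      std::greater<std::pair<float, int>>());
    for (int k = 0; k < beam_size; ++k) {
      std::printf("source %zu: id=%d score=%.2f\n", s, cands[k].second,
                  cands[k].first);
    }
  }
  return 0;  // prints ids 4, 5 for source 0 and 3, 8 for source 1
}
```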
*/ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a504e3bd6b37c3cc772c179eab6038a4..3fa5a7ae336a9be984324411b88570aea99c2c78 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3f0ae9c7612c4f74470472393ff4ad6..aedb82da2f0fb2f15e1586d351af7c9d4364852b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e43f14445c969648cd469b0e32d644c7..74892316e6decdeab3a08396fa2f4bdeb8eb7b73 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e798912ff80eeb5d753914c7d8d70a05f..cf6e89b3d9f11f2b68322ef15ddf026625f6a5a5 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096eb3d23273e362e658cb1b5fc808609..3e48b67a570d41482e358ae3941eb1e2b6ab91f8 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + 
PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f78d505d7444cd12105aee40b0d03349b07be3 --- /dev/null +++ b/paddle/fluid/operators/ngraph/CMakeLists.txt @@ -0,0 +1,4 @@ +if(WITH_NGRAPH) + cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) + op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) +endif() diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..fde3a5ba55bf13a6dc60cce0915d71f27f640e90 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,492 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 
1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == + paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (paddle::framework::NgraphBridge::NG_NODE_MAP.find( + ops.at(pivot)->Type()) != + paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& 
interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void 
NgraphEngine::BuildNgFunction() { + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +void NgraphEngine::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string input_shape_str; + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + for (size_t i = 0; i < shape.size(); ++i) { + input_shape_str += std::to_string(shape.at(i)); + } + } + func_cache_key_ = input_shape_str + func_cache_key_; + if (func_cache_.find(func_cache_key_) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(func_cache_key_); + } else { + BuildNgFunction(); + func_cache_[func_cache_key_] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphEngine::Run(const framework::Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + auto ng_type = var_type_map_.at(vi); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || + ng_op_state_ == OpState::FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto vo = var_out_[i]; + auto* var = scope.FindVar(vo); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(vo); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", vo); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vo); + } + } + + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); +} // NgraphEngine::Run +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..bf5ff2a743b0edb69163e674d36c56a02c0b4153 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
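GetNgFunction() above memoises compiled functions under a key built from every input shape concatenated with the serialized-graph key, so a new shape triggers a rebuild while repeated shapes reuse the previous compilation. A simplified sketch of that scheme, with hypothetical CompiledFunc/FuncCache names standing in for ngraph::Function and the static cache:

```cpp
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct CompiledFunc {};  // stand-in for ngraph::Function

class FuncCache {
 public:
  std::shared_ptr<CompiledFunc> GetOrBuild(
      const std::vector<std::vector<size_t>>& input_shapes,
      const std::string& graph_key,
      const std::function<std::shared_ptr<CompiledFunc>()>& build) {
    // Key = all input dimensions concatenated, then the graph identity.
    std::string key;
    for (const auto& shape : input_shapes) {
      for (size_t d : shape) key += std::to_string(d);
    }
    key += graph_key;
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;  // cache hit
    auto fn = build();                          // cache miss: compile once
    cache_[key] = fn;
    return fn;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<CompiledFunc>> cache_;
};
```

One caveat of concatenating bare digits, as the shape string here does, is that shapes like {12, 3} and {1, 23} yield the same key; a delimiter between dimensions would make the key unambiguous.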
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace operators { + +enum class OpState { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST, /* Support partial list of ops for test */ + FULL, /* All ops supported from feed to fetch */ + UNKNOWN /* Output all for debug purpose */ +}; + +// perform graph build through bridge and execute computation +class NgraphEngine { + public: + explicit NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval); + + void Run(const framework::Scope& scope, const platform::Place& place) const; + + static void EnableNgraph(const framework::ProgramDesc& program); + + private: + static std::unordered_map> + func_cache_; + const framework::Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + OpState ng_op_state_ = OpState::UNKNOWN; + std::string func_cache_key_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // prepare info for nraph engine + void Prepare(const framework::BlockDesc& block, + const std::vector& interval); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3051ca123b29658d3e9a35239ad00f621a297cb5 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
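The OpState values declared above are resolved in Prepare(): an interval that spans everything between feed and fetch starts as FULL, any *_grad op upgrades the state to a train variant, and everything else falls back to a test variant. A hypothetical condensation of that decision logic:

```cpp
#include <string>
#include <vector>

enum class OpState {
  FULL_TRAIN, PARTIAL_TRAIN, FULL_TEST, PARTIAL_TEST, FULL, UNKNOWN
};

// covers_feed_to_fetch: the fused interval sits exactly between the feed
// and fetch ops. Any *_grad op anywhere in the block implies training.
OpState ResolveOpState(bool covers_feed_to_fetch,
                       const std::vector<std::string>& all_op_types) {
  OpState state = covers_feed_to_fetch ? OpState::FULL : OpState::UNKNOWN;
  for (const std::string& type : all_op_types) {
    if (type.find("_grad") != std::string::npos) {
      return state == OpState::FULL ? OpState::FULL_TRAIN
                                    : OpState::PARTIAL_TRAIN;
    }
  }
  return state == OpState::FULL ? OpState::FULL_TEST
                                : OpState::PARTIAL_TEST;
}
```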
*/
+
+#include <string>
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h"
+
+namespace paddle {
+namespace operators {
+
+class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDispensable();
+    AddOutput("Ys", "A list of outputs").AsDispensable();
+    AddAttr<std::string>("graph", "the graph.");
+    AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
+    AddComment("ngraph engine operator.");
+  }
+};
+
+class NgraphEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker,
+                  ops::NgraphEngineOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ngraph_engine,
+    ops::NgraphEngineKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2974298b0707575624ad2f6935e83d06b4c83bb
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
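The interval attribute consumed by this operator comes from NgraphOpIntervals(), which scans the block between the feed and fetch sections and records each maximal run of bridge-supported ops. A standalone sketch of that scan, with hypothetical names and plain op-type strings in place of OpDesc:

```cpp
#include <string>
#include <unordered_set>
#include <vector>

// Walk the op list between the feed section (index `left`) and the fetch
// section (index `right`) and emit every maximal run [start, end) of ops
// that the bridge knows how to convert.
std::vector<std::vector<int>> SupportedIntervals(
    const std::vector<std::string>& op_types,
    const std::unordered_set<std::string>& supported,
    int left, int right) {
  std::vector<std::vector<int>> intervals;
  int pivot = left;
  while (pivot < right) {
    if (!supported.count(op_types[pivot])) {
      ++pivot;  // unsupported op: leave it to the regular executor
      continue;
    }
    int start = pivot;
    while (pivot < right && supported.count(op_types[pivot])) ++pivot;
    intervals.push_back({start, pivot});  // [start, pivot) is fusible
  }
  return intervals;
}
```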
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index bcec6f3563df7f4e1e48554cc891d596f9e56024..8d695fdedd04055215864ca4f0a7059ed7a5d6b0 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -255,8 +256,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel { Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -270,14 +271,18 @@ class GPUROIAlignOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - GPUROIAlignForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + GPUROIAlignForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, - roi_batch_id_list_gpu.data(), + height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, out->mutable_data(ctx.GetPlace())); } }; @@ -307,8 +312,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = 
roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -316,24 +321,28 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = roi_batch_id_list.numel() * sizeof(int); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIAlignBackward< - T><<>>( + GPUROIAlignBackward<<>>( output_grad_size, rois->data(), out_grad->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_batch_id_list_gpu.data(), + sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 75c3dd6bc498e35c6249f79a1c24cfe17316670e..ac3a4201e65256ae16c3376b385dd6000da60fe6 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
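The change above replaces a blocking TensorCopySync of the small ROI batch-id table with a temporary device allocation plus an asynchronous copy enqueued on the kernel's own stream. Below is a sketch of the same pattern using the raw CUDA runtime in place of Paddle's DeviceTemporaryAllocator (hypothetical helper, error checking omitted):

```cpp
#include <cuda_runtime.h>
#include <vector>

// Build the small per-ROI batch-id table on the host, then copy it
// asynchronously on the same stream the kernel will run on, so stream
// ordering alone guarantees the table is resident before the kernel reads it.
int* StageRoiBatchIds(const std::vector<int>& host_ids, cudaStream_t stream) {
  const size_t bytes = host_ids.size() * sizeof(int);
  int* device_ids = nullptr;
  cudaMalloc(&device_ids, bytes);
  // Enqueued on `stream`; a subsequent kernel launch on the same stream
  // is ordered after this copy completes.
  cudaMemcpyAsync(device_ids, host_ids.data(), bytes,
                  cudaMemcpyHostToDevice, stream);
  return device_ids;  // caller launches the kernel on `stream`, then cudaFree
}
```

Note that a cudaMemcpyAsync from pageable host memory does not overlap with host work; pinned memory would be needed for true asynchrony, but the stream-ordering guarantee that the kernels rely on holds either way.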
*/ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -152,8 +153,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -168,15 +169,20 @@ class GPUROIPoolOpKernel : public framework::OpKernel { } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); - - GPUROIPoolForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + GPUROIPoolForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), + height, width, pooled_height, pooled_width, roi_id_data, + out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } }; @@ -204,8 +210,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -213,25 +219,30 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward< - T><<>>( + GPUROIPoolBackward<<>>( output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), + width, pooled_height, pooled_width, roi_id_data, x_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc 
b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index c07e6962e673ceb274ef31cbf492f378ae696137..27e0201bd70df59c58eaa7567d5bb69eb1b721b4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -68,6 +68,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { "Level number of Input(X)'s lod could be 0. Otherwise " "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod."); + } else { + PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, + "When Input(X)'s lod is null, the dims[0] of " + "Input(X) should match the " + "size of Input(Y)'s referred level lod."); } int64_t out_first_dim = 0; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 14746fa95159d707be7c10c69a4ffc2211e17a93..c21b0c13c752b82b80c120cb5a5d4a010ef18287 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -101,6 +101,10 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("normalize", + "if true, divide the loss by the number of " + "targets != ignore_index.") + .SetDefault(false); AddAttr("ignore_index", "(int, default kIgnoreIndex), Specifies a target value that " "is ignored and" @@ -145,9 +149,14 @@ REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL( + sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel, + ops::SigmoidCrossEntropyWithLogitsKernel); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>); + paddle::platform::CPUDeviceContext, float>, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index a1fbc7e5fab71df486b53c31464c99e9c4557ccd..2a4570ef5cec0bee07efd69a2efd1a079ff33df5 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
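The kernels registered in this operator all evaluate the numerically stable form out = max(x, 0) - x * label + log(1 + exp(-|x|)), which never exponentiates a large positive value, and the new normalize attribute divides by the count of non-ignored targets. A hypothetical host-side reference of that forward computation, substituting std::log1p for the log(1 + ...) term:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Stable sigmoid cross entropy with logits, with ignore_index masking and
// optional normalization by the number of valid (non-ignored) targets.
std::vector<float> SigmoidCeWithLogits(const std::vector<float>& x,
                                       const std::vector<float>& label,
                                       int ignore_index, bool normalize) {
  std::vector<float> out(x.size(), 0.f);
  int valid = 0;
  for (size_t i = 0; i < x.size(); ++i) {
    if (static_cast<int>(label[i]) == ignore_index) continue;  // ignored
    ++valid;
    out[i] = std::max(x[i], 0.f) - x[i] * label[i] +
             std::log1p(std::exp(-std::abs(x[i])));
  }
  if (normalize && valid > 0) {
    for (float& v : out) v /= static_cast<float>(valid);
  }
  return out;
}
```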
*/ +#include "cub/cub.cuh" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static HOSTDEVICE float real_exp(float x) { return expf(x); } +static HOSTDEVICE float real_exp(double x) { return exp(x); } +static HOSTDEVICE float real_log(float x) { return logf(x); } +static HOSTDEVICE float real_log(double x) { return log(x); } + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, + const int ignore_index, const int limit, + T *out_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + out_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); + out_data[i] = term1 - term2 + term3; + counts[i] = 1; + } + } +} + +template +__global__ void Sum(const T *counts, int num, const T eps, T *sum) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T in = 0; + for (int i = threadIdx.x; i < num; i += BlockDim) { + in += counts[i]; + } + __syncthreads(); + auto out = + BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + T a = out > eps ? 
out : eps; + sum[0] = a; + } +} + +template +__global__ void Div(T *loss, const int num, const T *norm) { + CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } +} + +template +__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, + const int ignore_index, const T *dout_data, + const int limit, T *dx_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T dout = dout_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + dx_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); + T diff = simoid_x - label; + dx_data[i] = dout * diff; + counts[i] = 1; + } + } +} + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); + int ignore_index = context.Attr("ignore_index"); + auto out_data = Out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.cuda_device_context(); + bool normalize = context.Attr("normalize"); + + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = Out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidForward<<>>( + X->data(), Labels->data(), ignore_index, limit, out_data, counts); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(out_data, limit, norm); + } + } +}; + +// dX = sigmoid(X) - labels +template +class GPUSigmoidCrossEntropyWithLogitsGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + + int ignore_index = context.Attr("ignore_index"); + + auto &dev_ctx = context.cuda_device_context(); + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = dX->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidBackward<<>>( + X->data(), Labels->data(), ignore_index, dOut->data(), limit, + dx_data, counts); + bool normalize = context.Attr("normalize"); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(dx_data, limit, norm); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>); + 
+
+// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+template <typename DeviceContext, typename T>
+class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    Tensor *Out = context.Output<Tensor>("Out");
+    int ignore_index = context.Attr<int>("ignore_index");
+    auto out_data = Out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.cuda_device_context();
+    bool normalize = context.Attr<bool>("normalize");
+
+    // Temporary memory
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T));
+    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
+
+    int limit = Out->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+    GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data,
+        counts);
+    if (normalize) {
+      auto norm_ptr = allocator.Allocate(sizeof(T));
+      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
+      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          counts, limit, static_cast<T>(1e-5), norm);
+      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm);
+    }
+  }
+};
+
+// dX = sigmoid(X) - labels
+template <typename DeviceContext, typename T>
+class GPUSigmoidCrossEntropyWithLogitsGradKernel
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto dx_data = dX->mutable_data<T>(context.GetPlace());
+
+    int ignore_index = context.Attr<int>("ignore_index");
+
+    auto &dev_ctx = context.cuda_device_context();
+    // Temporary memory
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T));
+    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
+
+    int limit = dX->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+    GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit,
+        dx_data, counts);
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {
+      auto norm_ptr = allocator.Allocate(sizeof(T));
+      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
+      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          counts, limit, static_cast<T>(1e-5), norm);
+      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
-                        ops::SigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, float>);
+                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
+                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, double>);
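
When the normalize attribute is set, both CUDA kernels above (now registered for float and double) divide the loss or gradient by the number of elements whose label is not ignore_index: counts[] is a 0/1 mask, a single-block cub::BlockReduce sums it (Sum), and Div scales every element on the same stream. The serial equivalent (a sketch of ours; the function name is hypothetical):

    // What the Sum + Div launches compute, serially. The 1e-5 floor keeps
    // an all-ignored batch from dividing by zero.
    void NormalizeByValidCount(const float *counts, float *out, int limit) {
      double norm = 0;
      for (int i = 0; i < limit; ++i) norm += counts[i];
      if (norm < 1e-5) norm = 1e-5;
      for (int i = 0; i < limit; ++i) out[i] /= norm;
    }
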
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index 6e75f9e0b8d825b100de5c46f151a808cdb1b9d5..8f459d573ae5930c27a97c39ac79231384c3d12f 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -13,54 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/eigen.h"
+#include <algorithm>
+#include <cmath>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct SigmoidCrossEntropyWithLogitsForward {
-  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
-      : ignore_index(ignore_index) {}
-
-  HOSTDEVICE T operator()(const T &x, const T &label) const {
-    if (static_cast<int>(label) == ignore_index) {
-      return static_cast<T>(0.);
-    }
-    T term1 = (x > 0) ? x : 0;
-    T term2 = x * label;
-    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
-    return term1 - term2 + term3;
-  }
-
-  int ignore_index;
-};
-
-template <typename T>
-struct SigmoidCrossEntropyWithLogitsBackward {
-  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
-      : ignore_index(ignore_index) {}
-
-  HOSTDEVICE T operator()(const T &x, const T &label) const {
-    if (static_cast<int>(label) == ignore_index) {
-      return static_cast<T>(0.);
-    }
-    T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
-    return simoid_x - label;
-  }
-
-  int ignore_index;
-};
 
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
@@ -70,16 +30,37 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
     const Tensor *X = context.Input<Tensor>("X");
     const Tensor *Labels = context.Input<Tensor>("Label");
     Tensor *Out = context.Output<Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
     int ignore_index = context.Attr<int>("ignore_index");
-
-    auto x = EigenVector<T>::Flatten(*X);
-    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto out = EigenVector<T>::Flatten(*Out);
-    auto &place = *context.device_context<DeviceContext>().eigen_device();
-
-    out.device(place) = x.binaryExpr(
-        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
+    auto out_data = Out->mutable_data<T>(context.GetPlace());
+    int limit = Out->numel();
+    auto x_data = X->data<T>();
+    auto label_data = Labels->data<T>();
+    for (int idx = 0; idx < limit; ++idx) {
+      T x = x_data[idx];
+      T label = label_data[idx];
+      if (static_cast<int>(label) == ignore_index) {
+        out_data[idx] = static_cast<T>(0.);
+      } else {
+        T term1 = (x > 0) ? x : 0;
+        T term2 = x * label;
+        T term3 = std::log(static_cast<T>(1) + std::exp(-std::abs(x)));
+        out_data[idx] = term1 - term2 + term3;
+      }
+    }
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {
+      int norm = 0;
+      T eps = static_cast<T>(1e-6);
+      for (int idx = 0; idx < limit; ++idx) {
+        T diff = label_data[idx] - static_cast<T>(ignore_index);
+        if ((diff < -eps) || (diff > eps)) {
+          norm += 1;
+        }
+      }
+      eps = static_cast<T>(1e-5);
+      norm = norm > eps ? norm : eps;
+      std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; });
+    }
   }
 };
 
@@ -92,19 +73,39 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
     const Tensor *Labels = context.Input<Tensor>("Label");
     const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto ignore_index = context.Attr<int>("ignore_index");
-    auto x = EigenVector<T>::Flatten(*X);
-    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto dout = EigenVector<T>::Flatten(*dOut);
-    auto dx = EigenVector<T>::Flatten(*dX);
-    auto &place =
-        *context.template device_context<DeviceContext>().eigen_device();
+    auto dx_data = dX->mutable_data<T>(context.GetPlace());
 
-    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
-                                         static_cast<int>(ignore_index)));
-    dx.device(place) = dout * diff;
+    int ignore_index = context.Attr<int>("ignore_index");
+    int limit = dX->numel();
+    auto x_data = X->data<T>();
+    auto label_data = Labels->data<T>();
+    auto dout_data = dOut->data<T>();
+    for (int idx = 0; idx < limit; ++idx) {
+      T x = x_data[idx];
+      T label = label_data[idx];
+      T dout = dout_data[idx];
+      if (static_cast<int>(label) == ignore_index) {
+        dx_data[idx] = static_cast<T>(0.);
+      } else {
+        T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+        T diff = simoid_x - label;
+        dx_data[idx] = dout * diff;
+      }
+    }
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {
+      int norm = 0;
+      T eps = static_cast<T>(1e-6);
+      for (int idx = 0; idx < limit; ++idx) {
+        T diff = label_data[idx] - static_cast<T>(ignore_index);
+        if ((diff < -eps) || (diff > eps)) {
+          norm += 1;
+        }
+      }
+      eps = static_cast<T>(1e-5);
+      norm = norm > eps ? norm : eps;
+      std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; });
+    }
   }
 };
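
One caveat in the CPU normalize path above: norm is declared int, so norm = norm > eps ? norm : eps; compares in floating point but truncates eps (1e-5) back to 0 on assignment, and a batch whose labels are all ignore_index would still divide by zero. Keeping the clamp in the floating-point type preserves the intended floor; a hypothetical fix (ours, not in the patch; names are ours):

    #include <algorithm>

    // Clamp in T instead of int so the 1e-5 floor survives the assignment.
    template <typename T>
    void ScaleByClampedCount(T *data, int limit, int valid_count) {
      T norm = std::max(static_cast<T>(valid_count), static_cast<T>(1e-5));
      for (int idx = 0; idx < limit; ++idx) data[idx] /= norm;
    }
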
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 88c4f508474e66953b79fb92ff1eb0b53a539f07..e7e990f759ba411f6954c51fb697a6befbad31b1 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -96,9 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase {
   void RunTrt(const framework::Scope &scope,
               const platform::Place &dev_place) const {
     int runtime_batch = 1;
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
     if (trt_engine_.get() == nullptr) {
       trt_engine_.reset(new TensorRTEngine(
-          max_batch_size_, workspace_size_, nullptr,
+          max_batch_size_, workspace_size_, stream,
           boost::get<platform::CUDAPlace>(dev_place).device));
       Prepare(scope, dev_place, trt_engine_.get());
     }
@@ -126,6 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       }
     }
 
+    cudaStreamSynchronize(stream);
     PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
     engine->Execute(runtime_batch);
@@ -163,7 +168,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       output_index += 1;
     }
 
-    cudaStreamSynchronize(*engine->stream());
+    cudaStreamSynchronize(stream);
   }
 
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 287b0edc96e5e312b0ff1725ee188ff319d44d23..391e7a1c070e040f6e90f820634c0d8b7cd40a96 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::vector<std::string>>(
       engine_op_desc.Proto(), "parameters",
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
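
Two things happen in the TensorRT changes above: the engine is now built on the stream of the place's CUDADeviceContext instead of a null stream, with cudaStreamSynchronize(stream) fencing both the input copies before Execute and the output copies after it, and the test workspace grows from 2 << 10 to 1 << 20 bytes. The shift arithmetic is easy to misread; a compile-time check (ours, not from the patch):

    // 2 << 10 is only 2 KiB, far too small for a TensorRT workspace;
    // 1 << 20 is 1 MiB.
    static_assert((2 << 10) == 2048, "old workspace_size");
    static_assert((1 << 20) == 1048576, "new workspace_size");
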
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index 9f504d14a8da116648483c0f64cb511b46e6a97e..2ce8f141d3c51661305f4952479cf2889fc4f396 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <cuda.h>
 // NOTE(): support float16 to half in header file.
 #define PADDLE_CUDA_FP16
@@ -30,6 +31,34 @@ namespace platform {
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 #endif
 
+inline static int RoundToPowerOfTwo(int dim) {
+  if (dim > 512) {
+    return 1024;
+  } else if (dim > 256) {
+    return 512;
+  } else if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+
+#define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
+  case (dim): {                            \
+    constexpr auto kPowerOfTwoDim = (dim); \
+    __VA_ARGS__;                           \
+  } break
+
+#define CUDA_LAUNCH_KERNEL_HELPER(...)         \
+  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \
+  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \
+  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);  \
+  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);
+
 template <typename T>
 __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                  int delta, int width = 32) {
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 8f80a2d7822f1dc16cee2514a991b7341f5d1cfd..2493fb71c019f9923012afa4a46cb3e95479f860 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
-        "'Place' is not supported, Please re-compile with WITH_GPU "
-        "option");
+        "Place %s is not supported, Please re-compile with WITH_GPU "
+        "option",
+        place);
   }
   return it->second.get().get();
 }
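
RoundToPowerOfTwo and the CUDA_LAUNCH_KERNEL_* macros above let a runtime dimension select a kernel whose block size is a compile-time template parameter: the dimension is rounded up, and the helper expands to the case labels of a switch, binding kPowerOfTwoDim in each. Note that the helper as committed covers only 256 down to 32, while RoundToPowerOfTwo can return up to 1024, so larger results need their own cases. A usage sketch (ours; the kernel and variable names are hypothetical):

    int block_dim = RoundToPowerOfTwo(cols);  // e.g. 100 -> 128
    switch (block_dim) {
      CUDA_LAUNCH_KERNEL_HELPER(
          RowReduceKernel<T, kPowerOfTwoDim><<<rows, kPowerOfTwoDim, 0,
                                               stream>>>(in, out, cols));
      default:
        PADDLE_THROW("block_dim not covered by the dispatch table");
    }
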
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index ca89d91aadb2d3e9005e6dd06cef124428d7e250..400a6d7bfa5912774c4bbb2a5868dd9a471afd00 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #include <algorithm>
+#include <cstdlib>
+#include <string>
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "",
 namespace paddle {
 namespace platform {
 
-int GetCUDADeviceCount() {
+static int GetCUDADeviceCountImpl() {
+  const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
+  if (cuda_visible_devices != nullptr) {
+    std::string cuda_visible_devices_str(cuda_visible_devices);
+    if (std::all_of(cuda_visible_devices_str.begin(),
+                    cuda_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
+      return 0;
+    }
+  }
+
   int count;
   PADDLE_ENFORCE(
       cudaGetDeviceCount(&count),
@@ -66,6 +79,11 @@ int GetCUDADeviceCount() {
   return count;
 }
 
+int GetCUDADeviceCount() {
+  static auto dev_cnt = GetCUDADeviceCountImpl();
+  return dev_cnt;
+}
+
 int GetCUDAComputeCapability(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   cudaDeviceProp device_prop;
@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() {
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
-                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
+                 "(%p -> %p, length: %d)",
+                 src, dst, static_cast<int>(count));
 }
 
 void GpuMemcpySync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind) {
   PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
+                 "%p, length: %d)",
+                 src, dst, static_cast<int>(count));
 }
 
 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index faac6a12c66378d090b642312df4538aeeb3d8cd..269280d604a13a62046fb7811d34b7c69b61b50f 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
     mem_fmt.ndims = axis.size();
     for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
       mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
-                                    // regardless physical layout)
+                                     // regardless physical layout)
     }
     mem_fmt.data_type = mkldnn_f32;
     mem_fmt.format = mkldnn_blocked;
@@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
     for (int i = nchw_tz.size() - 1; i >= 0; --i) {
       mem_fmt.layout_desc.blocking.padding_dims[i] =
           nchw_tz[i];  // logical dimensions (nchw format, regardless physical
-                      // layout)
+                       // layout)
       mem_fmt.layout_desc.blocking.block_dims[i] = 1;
       mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
       mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 67b2386813eeafca5ac627edf79cad7e02445b8c..803ea6b26087884ad79c6bf80238953a012eaddc 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -18,9 +18,9 @@ if(WITH_PYTHON)
       SRCS ${PYBIND_SRCS}
       DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
+    if(NOT APPLE AND NOT WIN32)
       target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
+    endif(NOT APPLE AND NOT WIN32)
   endif(WITH_AMD_GPU)
 
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index dbc7843caa0c0a39a32cda6050fa99a3ab4c3e22..31c3bfa43ffec22059a602e9ff09a33188d72c91 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -15,18 +15,38 @@ limitations under the License. */
*/ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module *m) { +void BindTracer(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CPUPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); } diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index d32fe58f8695a5c14f276ef038416f5c47f3400f..24059140ab20e24917b93a5f60936b1087797ff9 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -15,7 +15,9 @@ #include "paddle/fluid/pybind/ir.h" #include #include +#include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" @@ -24,6 +26,7 @@ namespace py = pybind11; using paddle::framework::ir::Graph; using paddle::framework::ir::Node; +using paddle::framework::ir::GraphSafeRemoveNodes; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; using paddle::framework::VarDesc; @@ -32,6 +35,7 @@ using pybind11::return_value_policy; namespace paddle { namespace pybind { void BindGraph(py::module *m) { + m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes); py::class_>( *m, "Graph", "The graph is a Directed Acyclic Single Static Assignment Graph, see " @@ -42,6 +46,8 @@ void BindGraph(py::module *m) { .def("get_float", &Graph::Get) .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) + .def("get_program", &Graph::Get) + .def("get_marked_nodes", &Graph::Get>) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -57,6 +63,17 @@ void BindGraph(py::module *m) { [](Graph &self, const std::string &attr_name, double attr) { return self.Set(attr_name, new double(attr)); }) + .def("set", + [](Graph &self, const std::string &attr_name, + const ProgramDesc &attr) { + return self.Set(attr_name, new ProgramDesc(attr)); + }) + .def("set", + [](Graph &self, const std::string &attr_name, + const std::unordered_set &attr) { + return self.Set(attr_name, + new std::unordered_set(attr)); + }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) .def("create_var_node", @@ -85,12 +102,52 @@ void BindNode(py::module *m) { py::class_ node(*m, "Node"); node.def("name", &Node::Name) 
.def("node_type", &Node::NodeType) - .def("var", &Node::Var) - .def("op", &Node::Op) + .def("var", &Node::Var, return_value_policy::reference) + .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) .def("is_ctrl_var", &Node::IsCtrlVar) + .def("inputs_remove", + [](Node &self, int node_id) { + for (auto it = self.inputs.begin(); it != self.inputs.end(); + it++) { + if ((*it)->id() == node_id) { + self.inputs.erase(it); + } + } + }) + .def("inputs_remove", + [](Node &self, Node &node) { + for (auto it = self.inputs.begin(); it != self.inputs.end(); + it++) { + if (*it == &node) { + self.inputs.erase(it); + } + } + }) + .def("inputs_append", + [](Node &self, Node &node) { self.inputs.push_back(&node); }) + .def("outputs_remove", + [](Node &self, int node_id) { + for (auto it = self.outputs.begin(); it != self.outputs.end(); + it++) { + if ((*it)->id() == node_id) { + self.outputs.erase(it); + } + } + }) + .def("outputs_remove", + [](Node &self, Node &node) { + for (auto it = self.outputs.begin(); it != self.outputs.end(); + it++) { + if (*it == &node) { + self.outputs.erase(it); + } + } + }) + .def("outputs_append", + [](Node &self, Node &node) { self.outputs.push_back(&node); }) .def_readwrite("inputs", &Node::inputs) .def_readwrite("outputs", &Node::outputs); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 4b218fb3a2af0933ea1e87abe20e7e031c32f721..e729be4a95a58510f1e0162af4216feaa400d971 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -228,7 +228,7 @@ void BindBlockDesc(pybind11::module *m) { void BindVarDsec(pybind11::module *m) { pybind11::class_ var_desc(*m, "VarDesc", ""); - var_desc + var_desc.def(pybind11::init()) .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference) .def("set_name", &pd::VarDesc::SetName) .def("set_shape", &pd::VarDesc::SetShape) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b086c218988b09955a06743325d00f9bd57f7d90..9b8bc16e24289548d9ef28253d0997e0490fc5d2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) .def_property( @@ -626,7 +642,18 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -634,7 +661,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -788,21 +820,33 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + m.def("get_pass", [](const py::bytes &binary_str) { + std::string pass_type(binary_str); + auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); + return std::shared_ptr(std::move(pass)); + }); py::class_> pass(m, "Pass"); pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set", + [](ir::Pass &self, const std::string &attr_name, + const ProgramDesc &attr) { + return self.Set(attr_name, new ProgramDesc(attr)); + }) .def( - "set_str", + "set", [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) + .def("set", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("get_program", &ir::Pass::Get) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { std::unique_ptr origin_graph(graph.get()); auto optim_graph = self.Apply(std::move(origin_graph)); - graph.reset(optim_graph.release()); + optim_graph.release(); }); py::class_> pb( @@ -993,7 +1037,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore deleted file mode 100644 index 80d1f76fbc05627e21e334af55d63a4a534434c6..0000000000000000000000000000000000000000 --- a/paddle/py_paddle/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -swig_paddle.py -_swig_paddle.so diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py deleted file mode 100644 index 43614b9779d21795f1f274589ea93639e923ce75..0000000000000000000000000000000000000000 --- a/paddle/py_paddle/dataprovider_converter.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.trainer.PyDataProvider2 as dp2 -import collections -import swig_paddle -import numpy -import itertools -from functools import reduce - -__all__ = ['DataProviderConverter'] - - -class IScanner(object): - """ - The scanner will scan Python object two passes, then convert it to Paddle's - argument. - - In the first pass, `pre_scan` will be invoked by every data instance, and - then invoke `finish_pre_scan` to arguments. And the second pass do the same - thing except the functions changed to `scan`, `finish_scan`. - - During the first pass, a scanner may count the shape of input matrix and - allocate memory for this argument. Then fill the data into this argument - in second pass. - """ - - def __init__(self, input_type, pos): - self.input_type = input_type - if not isinstance(self.input_type, dp2.InputType): - raise ValueError("input type should be dataprovider2.InputType") - self.pos = pos - # data_in_gpu is used to indicate whether to create argument on GPU - # or not in GPU mode. Now if using one thread (trainer_count=1), - # trainer uses NeuralNetwork which needs to create argument on GPU - # before calling forward function. So, set data_in_gpu to True. - # Otherwise, trainer uses MultiGradientMachine which will transfer - # data from CPU to GPU in the forward function, set data_in_gpu to - # False in this case. - self.data_in_gpu = swig_paddle.isUsingGpu( - ) and swig_paddle.getTrainerCount() == 1 - - def pre_scan(self, dat): - """ - First pass scan method. During this method, the scanner could count the - data number, and get the total memory size this batch would use. - - :param dat: The python object. - """ - pass - - def finish_pre_scan(self, argument): - """ - Finish first scan pass. Allocate the memory. - - :param argument: Output arguments object. - :type argument: swig_paddle.Arguments - :param dat: Output arguments object. - :type dat: The Python object, numpy.array or List. - :return: - """ - pass - - def scan(self, dat): - """ - Second pass scan method. Copy the data to arguments. - - :param dat: The python object. - """ - pass - - def finish_scan(self, argument): - """ - Finish second pass. Finalize the resources, etc. - - :param argument: Output arguments object. - :type argument: swig_paddle.Arguments - """ - pass - - -class DenseScanner(IScanner): - """ - :type __mat__: numpy.ndarray - """ - - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__mat__ = None - self.__shape__ = None - self.__height__ = 0 - self.__dim__ = 0 - - def pre_scan(self, dat): - self.__height__ += 1 - if self.__shape__ is None: - self.__shape__ = numpy.array(dat).shape - if len(self.__shape__) > 3: - raise ValueError( - "The dimension of input cannot be greater than 3.") - if len(self.__shape__) == 0: - raise ValueError( - "The input should be a vector, please check your input data." 
- ) - self.__dim__ = reduce(lambda x, y: x * y, self.__shape__) - if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim: - raise ValueError( - "The data size must be equal to it in data layer.") - else: - if self.__shape__ != numpy.array(dat).shape: - raise ValueError( - "The data shape must be same in one mini-batch.") - - def finish_pre_scan(self, argument): - self.__mat__ = numpy.ndarray( - shape=(self.__height__, self.__dim__), dtype=numpy.float32) - self.__height__ = 0 - - def scan(self, dat): - # It's better to use NumPy array for speed. - dat = numpy.array(dat) - dat = dat.flatten() - self.__mat__[self.__height__] = dat - self.__height__ += 1 - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - if self.__mat__.dtype != numpy.float32: - self.__mat__ = self.__mat__.astype(numpy.float32) - m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, - self.data_in_gpu) - argument.setSlotValue(self.pos, m) - if len(self.__shape__) > 1: - # The last-two dimenstions are the frame height and width. - # For example, the layout is CHW for 3-D feature of image. - # The H and W are the frame height and width. - h, w = self.__shape__[-2:] - argument.setSlotFrameHeight(self.pos, h) - argument.setSlotFrameWidth(self.pos, w) - self.__shape__ = None - - -class SparseBinaryScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__rows__ = [0] - self.__cols__ = [] - self.__height__ = 0 - self.__value__ = [] - - def scan(self, dat): - self.extend_cols(dat) - self.__rows__.append(len(self.__cols__)) - self.__height__ += 1 - - def extend_cols(self, dat): - self.__cols__.extend(dat) - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - m = swig_paddle.Matrix.createSparse( - self.__height__, - self.input_type.dim, - len(self.__cols__), - len(self.__value__) == 0, - False, # trans - False) # TODO supoort GPU - assert isinstance(m, swig_paddle.Matrix) - m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__) - argument.setSlotValue(self.pos, m) - - -class SparseFloatScanner(SparseBinaryScanner): - def __init__(self, input_type, pos): - SparseBinaryScanner.__init__(self, input_type, pos) - - def extend_cols(self, dat): - self.__cols__.extend((x[0] for x in dat)) - self.__value__.extend((x[1] for x in dat)) - - -class IndexScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__ids__ = None - self.__idx__ = 0 - - def pre_scan(self, dat): - self.__idx__ += 1 - - def finish_pre_scan(self, argument): - self.__ids__ = [0] * self.__idx__ - self.__idx__ = 0 - - def scan(self, dat): - self.__ids__[self.__idx__] = dat - self.__idx__ += 1 - - def finish_scan(self, argument): - ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) - assert isinstance(argument, swig_paddle.Arguments) - argument.setSlotIds(self.pos, ids) - - -class SequenceScanner(IScanner): - def __init__(self, input_type, pos, inner_scanner, setter): - IScanner.__init__(self, input_type, pos) - self.__seq__ = [0] - self.__inner_scanner__ = inner_scanner - self.__setter__ = setter - - def pre_scan(self, dat): - for each in dat: - self.__inner_scanner__.pre_scan(each) - - def finish_pre_scan(self, argument): - self.__inner_scanner__.finish_pre_scan(argument) - - def scan(self, dat): - self.__seq__.append(self.__seq__[-1] + self.get_size(dat)) - for each in dat: - self.__inner_scanner__.scan(each) - - def finish_scan(self, argument): - seq = 
swig_paddle.IVector.create(self.__seq__, False) - self.__setter__(argument, self.pos, seq) - self.__inner_scanner__.finish_scan(argument) - - def get_size(self, dat): - if isinstance(self.__inner_scanner__, SequenceScanner): - return sum(self.__inner_scanner__.get_size(item) for item in dat) - else: - return len(dat) - - -class DataProviderConverter(object): - def __init__(self, input_types): - self.input_types = input_types - assert isinstance(self.input_types, collections.Sequence) - for each in self.input_types: - assert isinstance(each, dp2.InputType) - - def convert(self, dat, argument=None): - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.input_types)) - - scanners = [ - DataProviderConverter.create_scanner(i, each_type) - for i, each_type in enumerate(self.input_types) - ] - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, scanners): - scanner.pre_scan(each_step) - - for scanner in scanners: - scanner.finish_pre_scan(argument) - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, scanners): - scanner.scan(each_step) - - for scanner in scanners: - scanner.finish_scan(argument) - - return argument - - def __call__(self, dat, argument=None): - return self.convert(dat, argument) - - @staticmethod - def create_scanner(i, each): - assert isinstance(each, dp2.InputType) - retv = None - if each.type == dp2.DataType.Dense: - retv = DenseScanner(each, i) - elif each.type == dp2.DataType.Index: - retv = IndexScanner(each, i) - elif each.type == dp2.DataType.SparseNonValue: - retv = SparseBinaryScanner(each, i) - elif each.type == dp2.DataType.SparseValue: - retv = SparseFloatScanner(each, i) - assert retv is not None - - if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq)) - - if each.seq_type in [ - dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE - ]: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq)) - return retv diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py deleted file mode 100644 index 3ae8dbf964c68c6f01ba30cb3ac69fb6c2f08c30..0000000000000000000000000000000000000000 --- a/paddle/py_paddle/util.py +++ /dev/null @@ -1,578 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Some Useful method for py_paddle. -""" - -import swig_paddle -import os -import paddle.trainer.PyDataProviderWrapper -import paddle.proto.ParameterConfig_pb2 -import paddle.proto.ModelConfig_pb2 -import paddle.proto.TrainerConfig_pb2 -import weakref -import numpy -import struct -import sys -import copy - - -def initializePaddle(*args): - """ - To initialize paddle process. - :param args: Command line options, such as --use_gpu=0, etc. - :return: Nothing. 
- """ - old_argv = copy.deepcopy(sys.argv) - old_pypath = os.getenv("PYTHONPATH") - pypath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) - if old_pypath is not None: - pypath = os.pathsep.join([pypath, old_pypath]) - os.putenv("PYTHONPATH", pypath) - args = [""] + list(args) # argv[0] is command name, it is not important. - swig_paddle.__initPaddle__(args) - sys.argv = old_argv - - -def __monkeypatch_init_paddle__(): - swig_paddle.__initPaddle__ = swig_paddle.initPaddle - swig_paddle.initPaddle = initializePaddle - - -class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback): - """ - Wrap the python callable object to paddle.UpdateCallback. - - INTERNAL USE ONLY. - """ - - def __init__(self, callback): - swig_paddle.UpdateCallback.__init__(self) - self.callback = callback - - def apply(self, param): - self.callback(param) - - @staticmethod - def wrap(callback): - """ - Cast the python callable object/paddle.UpdateCallback to - swig_paddle.UpdateCallback.__disown__ - :param callback: callable or swig_paddle.UpdateCallback object. - """ - if isinstance(callback, swig_paddle.UpdateCallback): - return callback.__disown__() - elif isinstance(callback, weakref.ProxyType): - raise RuntimeError("Should not pass __disown__ object") - else: - return __ParameterCallbackWrapper__(callback).__disown__() - - -def __arguments_to_numpy__(i, arg): - assert isinstance(arg, swig_paddle.Arguments) - value = arg.getSlotValue(i) - ids = arg.getSlotIds(i) - prob = arg.getSlotIn(i) - if value is not None: - assert isinstance(value, swig_paddle.Matrix) - value = value.copyToNumpyMat() - if ids is not None: - assert isinstance(ids, swig_paddle.IVector) - ids = ids.copyToNumpyArray() - if prob is not None: - assert isinstance(prob, swig_paddle.Matrix) - prob = prob.copyToNumpyMat() - return {"value": value, "id": ids, "prob": prob} - - -def __monkeypatch_gradient_machine__(): - """ - Add some class methods to GradientMachine. - This method should be only used internally. - """ - swig_paddle.GradientMachine.loadFromConfigFile = \ - staticmethod(loadGradientMachine) - - def __matrix_to_numpy__(m): - if isinstance(m, swig_paddle.Matrix): - return m.copyToNumpyMat() - elif isinstance(m, swig_paddle.IVector): - return m.copyToNumpyArra() - else: - raise RuntimeError("Input arg should be matrix or vecotr.") - - def createFromConfigProto(protoObj, - createMode=swig_paddle.CREATE_MODE_NORMAL, - paramTypes=[ - swig_paddle.PARAMETER_VALUE, - swig_paddle.PARAMETER_GRADIENT, - swig_paddle.PARAMETER_MOMENTUM - ]): - """ - Create Gradient Machine From Proto object. - :param protoObj: Model config - :type protoObj: proto.ModelConfig_pb2.ModelConfig - :param createMode: Create Mode, default is normal. - :type createMode: int - :param paramTypes: the gradient machine parameter type. - :type paramTypes: list of int - :return: paddle.GradientMachine - """ - assert isinstance(protoObj, paddle.proto.ModelConfig) - return swig_paddle.GradientMachine.createByConfigProtoStr( - protoObj.SerializeToString(), createMode, paramTypes) - - swig_paddle.GradientMachine.createFromConfigProto = \ - staticmethod(createFromConfigProto) - - def forwardTest(self, inArgs): - """ - forwardTest. forward gradient machine in test mode, and return a numpy - matrix dict. - - :param inArgs: The input arguments - :type inArgs: paddle.Arguments - :return: A dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. 
- """ - outArgs = swig_paddle.Arguments.createArguments(0) - self.forward(inArgs, outArgs, swig_paddle.PASS_TEST) - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.GradientMachine.forwardTest = forwardTest - - # Monkey patching backward - swig_paddle.GradientMachine.__backward__ = swig_paddle.GradientMachine.backward - - def backward(self, callback): - """ - GradientMachine Backward - :param callback: a callback which parameter is (paddle.Parameter) or - a paddle.UpdateCallback object. - """ - self.__backward__(__ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.backward = backward - - # Monkey patching forwardBackward. - swig_paddle.GradientMachine.__forwardBackward__ = \ - swig_paddle.GradientMachine.forwardBackward - - def forwardBackward(self, - inArgs, - outArgs, - passType, - callback=swig_paddle.UpdateCallback()): - """ - GradientMachine forward backward. - :param inArgs: Input Arguments for GradientMachine. - :type inArgs: paddle.Arguments - :param outArgs: Output Arguments for GradientMachine. - :type outArgs: paddle.Arguments - :param passType: gradient machine's pass type. - :type passType: paddle.PassType - :param callback: a callable object with arguments (paddle.Parameter) or - a paddle.UpdateCallback it will be called when - backward - """ - self.__forwardBackward__(inArgs, outArgs, passType, - __ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.forwardBackward = forwardBackward - - def getParameters(self): - return (self.getParameter(i) for i in xrange(self.getParameterSize())) - - swig_paddle.GradientMachine.getParameters = getParameters - - def getNonStaticParameters(self): - return (self.getNonStaticParameter(i) - for i in xrange(self.getNonStaticParameterSize())) - - swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters - - def getLayerOutputs(self, layerNames): - """ - getLayerOutputs. get outputs of layers and return a numpy matrix dict. - :param layerNames: layer names. - :type layerNames: string or list. - """ - if isinstance(layerNames, basestring): - layerNames = [layerNames] - elif not isinstance(layerNames, list): - raise RuntimeError("Input args shuld be string or a sting list.") - - output = dict() - for name in layerNames: - output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name)) - return output - - swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs - - -def loadGradientMachine(config_filename, model_dir=None): - """ - Load a gradient machine from config file name/path. - :param config_filename: The trainer config file name/path - :param model_dir: The model parameter directory. None if same as the - directory of config_filename - :return: GradientMachine with some enhance methods. - :rtype: paddle.GradientMachine - """ - trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile( - config_filename) - assert isinstance(trainer_config, swig_paddle.TrainerConfig) - model_conf = trainer_config.getModelConfig() - network = swig_paddle.GradientMachine.createByModelConfig(model_conf) - assert isinstance(network, swig_paddle.GradientMachine) - if model_dir is None: - model_dir = os.path.dirname(config_filename) - network.loadParameters(model_dir) - return network - - -def loadParameterFile(fn): - """ - Load Paddle Parameter file to numpy.ndarray - :param fn: file name or file like object. - :type fn: str or file like object. 
- :return: numpy array - :rtype: numpy.ndarray - :raise: paddle.UnsupportError when parameter format is wrong. - """ - if isinstance(fn, str): - with open(fn, 'rb') as f: - return loadParameterFile(f) - elif hasattr(fn, 'read'): # File like object - version, = struct.unpack('i', fn.read(4)) - if version != 0: - raise swig_paddle.UnsupportError() - value_length, = struct.unpack("I", fn.read(4)) - if value_length != 4 and value_length != 8: - raise swig_paddle.UnsupportError() - dtype = 'float32' if value_length == 4 else 'float64' - param_size, = struct.unpack("L", fn.read(8)) - value = numpy.fromfile(fn, dtype) - if len(value) != param_size: - raise swig_paddle.UnsupportError() - return value - else: - raise swig_paddle.UnsupportError() - - -class DataProviderWrapperConverter(object): - """ - A class convert DataFormat from PyDataProvider Wrapper to - py_paddle.paddle.Arguemnts. - """ - - class DenseValueConverter(object): - """ - Internal class - """ - - def __init__(self, header_def): - self.__dim__ = header_def.dim - self.buf = [] - - def append(self, other): - assert len(other) == self.__dim__ - self.buf += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createDense(self.buf, - len(self.buf) / self.__dim__, - self.__dim__) - arg.setSlotValue(slot_idx, mat) - - class IdValueConverter(object): - """ - Internal class - """ - - def __init__(self, *args): - self.buf = [] - - def append(self, other): - assert isinstance(other, int) - self.buf.append(other) - - def __call__(self, slot_idx, arg): - arg.setSlotIds(slot_idx, swig_paddle.IVector.create(self.buf)) - - class SparseNonValueConverter(object): - """ - Internal class - """ - - def __init__(self, slot_def): - self.indices = [0] - self.cols = [] - self.dim = slot_def.dim - - def append(self, other): - self.indices.append(self.indices[-1] + len(other)) - self.cols += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), True) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols) - self.putIntoArg(slot_idx, arg, mat) - - def putIntoArg(self, slot_idx, arg, mat): - arg.setSlotValue(slot_idx, mat) - - class SparseValueConverter(SparseNonValueConverter): - """ - Internal class - """ - - def __init__(self, slot_def): - super(DataProviderWrapperConverter.SparseValueConverter, - self).__init__(slot_def) - self.values = [] - - def append(self, other): - super(DataProviderWrapperConverter.SparseValueConverter, - self).append(map(lambda x: x[0], other)) - self.values += map(lambda x: x[1], other) - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), False) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols, self.values) - self.putIntoArg(slot_idx, arg, mat) - - __SLOT_VALUE_CONVERTER_MAP__ = { - paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter, - paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot: - SparseNonValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseValueSlot: - SparseValueConverter - } - - def __init__(self, use_seq, header): - """ - Ctor - :param use_seq: True if use sequence. 
- :param header: List of slots type, - trainer.PyDataProviderWrapper.SlotType - """ - self.__use_seq__ = use_seq - self.__header__ = header - - def convert(self, wrapper_data, argument=None): - """ - Convert PyDataProviderWrapper format to paddle.Argument - :param wrapper_data: PyDataProviderWrapper yield's data list. - :param argument: The output paddle.Arguments. - If it is not None, it will assign data in this - arguments, else it will create new arguments. - :return: arguments that contains data. - :rtype: paddle.Arguments - """ - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.__header__)) - - values = map( - lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x), - self.__header__) - - if self.__use_seq__: - seq_dim = [[] for _ in xrange(self.__header__.__len__())] - seq_start_pos = [[0] for _ in xrange(self.__header__.__len__())] - - for each_sample in wrapper_data: - for slot_idx, sequence in enumerate(each_sample): - for raw_data in sequence: - values[slot_idx].append(raw_data) - seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] + - len(sequence)) - seq_dim[slot_idx].append(len(sequence)) - - for slot_idx in xrange(len(self.__header__)): - argument.setSlotSequenceDim( - slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx])) - argument.setSlotSequenceStartPositions( - slot_idx, - swig_paddle.IVector.create(seq_start_pos[slot_idx])) - else: - for each_sample in wrapper_data: - for raw_data, value in zip(each_sample, values): - value.append(raw_data) - - for i, v in enumerate(values): - v(i, argument) - - return argument - - def __call__(self, wrapper_data, argument=None): - """ - Invoke self.convert. See documents in self.convert. - """ - return self.convert(wrapper_data, argument) - - -def __monkey_patch_protobuf_objects__(): - def ParameterConfig_toProto(self): - """ - Convert paddle.ParameterConfig to - proto.ParameterConfig_pb2.ParameterConfig - - :return: proto.ParameterConfig_pb2.ParameterConfig object. 
- """ - param_conf = paddle.proto.ParameterConfig_pb2.ParameterConfig() - param_conf.ParseFromString(self.toProtoString()) - return param_conf - - swig_paddle.ParameterConfig.toProto = ParameterConfig_toProto - - def OptimizationConfig_toProto(self): - """ - Convert paddle.OptimizationConfig to - proto.TrainerConfig_pb2.OptimizationConfig - - :return: proto.TrainerConfig_pb2.OptimizationConfig - """ - opt_conf = proto.TrainerConfig_pb2.OptimizationConfig() - opt_conf.ParseFromString(self.toProtoString()) - return opt_conf - - swig_paddle.OptimizationConfig.toProto = OptimizationConfig_toProto - - def OptimizationConfig_createFromProto(protoObj): - """ - Create a new paddle.OptimizationConfig from - proto.TrainerConfig_pb2.OptimizationConfig - - :param protoObj: proto.TrainerConfig_pb2.OptimizationConfig - :return: paddle.OptimizationConfig - """ - - assert isinstance(protoObj, paddle.proto.OptimizationConfig) - return swig_paddle.OptimizationConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.OptimizationConfig.createFromProto = staticmethod( - OptimizationConfig_createFromProto) - - def TrainerConfig_createFromProto(protoObj): - """ - Create a new paddle.TrainerConfig from - proto.OptimizationConfig - - :param protoObj: proto.TrainerConfig - :return: paddle.TrainerConfig - """ - assert isinstance(protoObj, paddle.proto.TrainerConfig) - return swig_paddle.TrainerConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.TrainerConfig.createFromProto = staticmethod( - TrainerConfig_createFromProto) - - -def __monkey_patch_parameter__(): - def getBufs(self): - """ - get all parameter vectors. - NOTE: the return value is a generator. Maybe you need to cast to - list or tuple or something else. - - :return: generator of all parameter vectors. - :rtype: generator - """ - return (self.getBuf(i) for i in xrange(swig_paddle.NUM_PARAMETER_TYPES)) - - swig_paddle.Parameter.getBufs = getBufs - - -def __monkey_patch_trainer__(): - swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create) - - def Trainer_create(config, model=None): - """ - Create a trainer for model with TrainerCOnfig trainer_config - trainer_config.model_config will be ignored when model is supplied. - Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only - when trainer_config.data_config is set. - - A typical usage for Trainer is: - .. code-block:: python - trainer = Trainer.create(trainer_config, model) - for p in xrange(num_passes) - while True: - data = get_next_batch(batch_size) - if not data: - break - trainer.trainOneDataBatch(batch_size, data) - trainer.finishTrainPass() - trainer.finishTrain() - - The trainer will take care of logging, model saving, distributed - training, etc. - - :param config: trainer configuration - :type config: paddle.proto.TrainerConfig - :param model: the model to be trained - :type model: swig_paddle.GradientMachine - :return: a trainer - :rtype swig_paddle.Trainer - - """ - assert isinstance(config, paddle.proto.TrainerConfig) - if model is not None: - assert isinstance(model, swig_paddle.GradientMachine) - return swig_paddle.Trainer.__create__( - swig_paddle.TrainerConfig.createFromProto(config), model) - - swig_paddle.Trainer.create = staticmethod(Trainer_create) - - swig_paddle.Trainer.__getForwardOutput__ = \ - swig_paddle.Trainer.getForwardOutput - - def getForwardOutput(self): - """ - Get the netword outputs from the previous trainOneBatch(), - trainOneDataBatch(), testOneDataPatch(), or forwardOneBatch() call. 
- - :return: list of dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. - """ - outArgs = self.__getForwardOutput__() - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.Trainer.getForwardOutput = getForwardOutput - - -def monkeypatches(): - patches = [ - __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, - __monkey_patch_protobuf_objects__, __monkey_patch_parameter__, - __monkey_patch_trainer__ - ] - for patch in patches: - patch() diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 9e8b135c1bc7fc05d88fe6f3bed17dd3b48e9615..6c608fce3cdad38f3109e563be3ffbe2f73e5390 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -40,7 +40,6 @@ The lastest pre-built build environment images are: | Image | Tag | | ----- | --- | | paddlepaddle/paddle | latest-dev | -| paddlepaddle/paddle | latest-dev-android | ### Start Build @@ -68,8 +67,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | -| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | -| `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. | | `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f58e392684d619e145b07ac61d2adfe175443bb6..c2156a436ec73d03082fa08b6250dc77b2cee19f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -33,7 +33,6 @@ function print_usage() { ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile - ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check ${BLUE}cicheck${NONE}: run CI tasks @@ -165,6 +164,9 @@ function cmake_gen() { INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} fi + distibuted_flag=${WITH_DISTRIBUTE:-OFF} + grpc_flag=${WITH_GRPC:-${distibuted_flag}} + cat <= 0: + if ops_type[search_end_index] == "relu": + return Calibrator.u8_max + + input_name = input_index_name[search_end_index][0] + + for i in output_index_name.keys(): + if input_name in output_index_name[i]: + search_end_index = i + break + + if ops_type[ + search_end_index] not in Calibrator.const_sign_op_type and ops_type[ + search_end_index] != 'conv2d': + return Calibrator.s8_max + + if ops_type[search_end_index] != 'conv2d': + continue + + if program.current_block().ops[search_end_index].has_attr( + 'fuse_relu') and program.current_block().ops[ + search_end_index].attr('fuse_relu'): + return Calibrator.u8_max + else: + return Calibrator.s8_max + + return Calibrator.s8_max + + def __check_op_type_with_specified_var_as_input(self, + program, + var_name, + start_index=0): + ''' + Check whether all the type of ops that use the specified variable as the + input.If one of those op is not int8-enabled, return False. 
+ ''' + op_type_list = [ + op.type for op in program.current_block().ops[start_index:] + if var_name in op.input_arg_names + ] + for i in op_type_list: + if not i in Calibrator.supported_int8_op_type: + return False + return True + + def __check_var_source_dt(self, var_name): + ''' + Check whether the specified variable is the output of int8 conv op or not. + If true, return the original op index. + If false, return -1 + ''' + return self._int8_output_var_op_index_dict[ + var_name] if var_name in self._int8_output_var_op_index_dict else -1 + + def __update_int8_output_var_op_index_dict(self, index, var_name=None): + ''' + Update the int8_output_variable/op_index dictionary + ''' + for k, v in self._int8_output_var_op_index_dict.items(): + if v >= index: + self._int8_output_var_op_index_dict[k] = v + 1 + if var_name: + self._int8_output_var_op_index_dict[var_name] = index + + def __update_program(self): + ''' + Update the program with the quantize/dequantize op insertion. + ''' + quantize_index, dequantize_index = self.__get_quantize_dequantize_combination( + self._output_program) + inserted_op_length = 0 + calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max + insert_op_collection = sorted(quantize_index + dequantize_index) + + for index in insert_op_collection: + if index in quantize_index: + quantize_tmp = self._output_program.current_block().create_var( + name="quantize_{}_tmp".format(index), + dtype=core.VarDesc.VarType.UINT8) + original_out_name = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output_names[0] + original_out = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output(original_out_name)[0] + + op = self._output_program.current_block()._insert_op( + index=index + inserted_op_length, + type="quantize", + inputs={"Input": original_out}, + outputs={"Output": quantize_tmp}, ) + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr( + "Scale", self._var_max_range[original_out] / + calc_max_func(self._var_max_value_map[original_out])) + + if self.__get_max_range_by_var_name( + self._output_program, + original_out) == Calibrator.s8_max: + op._set_attr("is_negative_input", 1) + + self.__update_int8_output_var_op_index_dict( + index + inserted_op_length, "quantize_{}_tmp".format(index)) + + inserted_op_length += 1 + for op in self._output_program.current_block().ops[ + index + inserted_op_length:]: + for j in op.input_names: + if op.input(j) and op.input( + j + )[0] == original_out and op.type in Calibrator.supported_int8_op_type: + op.desc.set_input(j, + ["{}".format(quantize_tmp.name)]) + else: + start_index = index + inserted_op_length + dequantize_tmp_var = self._output_program.current_block( + ).create_var( + name="dequantize_{}_tmp".format(index + 1), + dtype="float32", ) + original_out_var = None + + for original_input in self._output_program.current_block().ops[ + start_index].input_arg_names: + index_res = self.__get_op_index_by_output_var( + self._output_program, original_input) + if index_res != -1: + original_out_var = original_input + break + + if original_out_var: + op = self._output_program.current_block()._insert_op( + index=start_index, + type="dequantize", + inputs={"Input": original_out_var}, + outputs={"Output": dequantize_tmp_var}) + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr("Scale", self._var_max_range[original_out_var] + / calc_max_func(self._var_max_value_map[ + original_out_var])) + + 
for op_index in range( + start_index + 1, + len(self._output_program.current_block().ops)): + if self._output_program.current_block( + ).ops[op_index].type == "conv2d" and self._output_program.current_block( + ).ops[op_index].attr("force_fp32_output"): + continue + else: + for j in self._output_program.current_block().ops[ + op_index].input_names: + if len(self._output_program.current_block().ops[ + op_index].input(j) + ) and self._output_program.current_block( + ).ops[op_index].input(j)[ + 0] == original_out_var: + self._output_program.current_block( + ).ops[op_index].desc.set_input( + j, + ["{}".format(dequantize_tmp_var.name)]) + + inserted_op_length += 1 + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + + def __update_output_program_attr(self): + for i in self._output_program.list_vars(): + if i.name in self._persistable_vars: + i.persistable = False + os.system("rm -rf {}/{}".format(self.pretrained_model, i.name)) + + for i in self._u8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.UINT8) + + for i in self._s8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.INT8) + + @property + def sampling_program(self): + return self._output_program + + @property + def sampling_vars(self): + return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name + + def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + def __generate_output_program(self): + for i in self.program.list_vars(): + if not i.persistable and i.name in self.sampling_vars: + i.persistable = True + self._persistable_vars.append(i.name) + + self._output_program = self.program.clone() + + def __save_scale(self): + ''' + Update the convolution scale information. + ''' + func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max + for i in self._conv_op_index[1:]: + weights_var_name = self.program.current_block().ops[i].input( + 'Filter')[0] + input_var_name = self.program.current_block().ops[i].input('Input')[ + 0] + output_var_name = self.program.current_block().ops[i].output( + 'Output')[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_weights", self._weights_scaling_factor[weights_var_name]) + + self._output_program.current_block().ops[i]._set_attr( + "Scale_in", self._var_max_range[input_var_name] / + func(self._var_max_value_map[input_var_name])) + self._output_program.current_block().ops[i]._set_attr( + "Scale_out", self._var_max_range[output_var_name] / + func(self._var_max_value_map[output_var_name])) + if self._output_program.current_block().ops[i].desc.input( + "ResidualData"): + residual_var_name = self._output_program.current_block().ops[ + i].desc.input("ResidualData")[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_in_eltwise", self._var_max_range[residual_var_name] / + func(self._var_max_value_map[residual_var_name])) + + def __sampling(self, sampling_data): + ''' + Sampling the variables data range. 
+ ''' + for i in self.program.list_vars(): + if i.name not in self.sampling_vars: + continue + + if i.name in self._weights_var_name: + scaling_factor_per_channel = [] + data = sampling_data[i.name][0] + for j in range(data.shape[0]): + var_value = float(np.max(np.abs(data[j]))) + if not self._is_close(var_value, 0.0): + scaling_factor_per_channel.append(Calibrator.s8_max / + var_value) + else: + scaling_factor_per_channel.append(0.0) + self._weights_scaling_factor[ + i.name] = scaling_factor_per_channel + else: + if i.name in self._conv_output_var_name: + op_pos = self.__get_op_index_by_output_var(self.program, + i.name) + cur_op = self.program.current_block().ops[op_pos] + + if cur_op.has_attr('fuse_relu') and cur_op.attr( + 'fuse_relu'): + max_range = Calibrator.u8_max + self._u8_output_var.append(i.name) + else: + max_range = Calibrator.s8_max + self._s8_output_var.append(i.name) + else: + max_range = self.__get_max_range_by_var_name(self.program, + i.name) + max_value = [[np.abs(np_data)] + for np_data in sampling_data[i.name]] + + self._var_max_range[i.name] = max_range + self._var_max_value_map[i.name] = max_value + + def __check_force_fp32_attr_by_output_var(self, program, var_name): + for op in program.current_block().ops: + if op.type == "conv2d" and var_name in op.output_arg_names: + return op.attr("force_fp32_output") + return False + + def __get_op_index_by_output_var(self, program, var_name, start_index=0): + ''' + Check whether the specified variable is the output of a + conv/pool2d op or not. + + Returns: + The op index if the variable is the output of any conv/pool2d op. + -1 when the variable is not the output of any conv/pool2d op. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type: + return index + return -1 + + def __get_op_index_by_input_var(self, program, var_name, start_index=0): + ''' + Get the op index by the specified input variable. + Returns: + The op index if the variable is the input of this op or -1 if the + variable is not the input of any op. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.input_arg_names: + return index + + return -1 + + def __get_quantize_dequantize_combination(self, program): + """ + Get the quantize/dequantize op indices for the subsequent insertion. + Args: + The program desc. + Returns: + Two lists containing the quantize op and dequantize op index information. + """ + quantize_op_index = [] + dequantize_op_index = [] + minimal_conv_count = 2 # there must be at least two conv ops if the first conv is not int8-enabled. + if len(self._conv_op_index) < minimal_conv_count: + return [], [] + + for index, value in enumerate(self._conv_op_index): + if index == 0: + quantize_op_index.append(self._conv_op_index[index + 1]) + elif index == len(self._conv_op_index) - 1: + output_var = program.current_block().ops[value].output( + "Output")[0] + if self.__check_op_type_with_specified_var_as_input( + program, output_var, index): + dequantize_op_index.append(self._conv_op_index[index] + 2) + else: + program.current_block().ops[value]._set_attr( + "force_fp32_output", True) + + elif self._conv_op_index[index] + 1 < self._conv_op_index[index + + 1]: + + program.current_block().ops[self._conv_op_index[ + index]]._set_attr("force_fp32_output", True) + + for op_index in range(self._conv_op_index[index + 1], + self._conv_op_index[index], -1): + op_type = program.current_block().ops[op_index].type + op_has_int8_input = False + input_var_name = None + input_length = len(program.current_block().ops[op_index] + .input_arg_names) + + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if self.__check_var_source_dt(var_name) != -1: + op_has_int8_input = True + input_var_name = var_name + break + + if op_has_int8_input: + if op_type == "conv2d": + if program.current_block().ops[op_index + + 1].type == "conv2d": + continue + elif program.current_block( + ).ops[op_index + + 1].type in Calibrator.non_conv_int8_op_type: + dequantize_op_index.append(op_index + 2) + break + else: + program.current_block().ops[op_index]._set_attr( + "force_fp32_output", True) + continue + elif not self.__check_force_fp32_attr_by_output_var( + program, input_var_name + ) and op_index not in dequantize_op_index: + share_input_flag = True + for input_attr_name in program.current_block().ops[ + op_index].input_names: + input_var_name = program.current_block().ops[ + op_index].input(input_attr_name)[0] + cousin_op_index = self.__get_op_index_by_input_var( + program, input_var_name) + if cousin_op_index != -1 and cousin_op_index in dequantize_op_index: + share_input_flag = False + break + if share_input_flag: + dequantize_op_index.append(op_index) + + elif input_length: + output_is_to_int8_op = False + share_input_flag = True + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if not self.__check_op_type_with_specified_var_as_input( + program, var_name): + share_input_flag = False + break + + for var_name in program.current_block().ops[ + op_index].output_arg_names: + if self.__get_op_index_by_output_var( + program, var_name, op_index) != -1: + output_is_to_int8_op = True + break + + if share_input_flag or output_is_to_int8_op: + quantize_op_index.append(op_index) + + return quantize_op_index, dequantize_op_index + + def __init_analysis(self): + ''' + Collect the variable names for sampling. + ''' + start_index = 1 # analyze the conv op details starting from the second conv op.
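+ # For each conv op after the first one, record the names of its Filter,
+ # Input and Output variables (plus ResidualData and the output of a
+ # directly following pool2d, when present) so that their runtime values
+ # can be sampled later.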
+ + for i in self._conv_op_index[start_index:]: + self._weights_var_name.append(self.program.current_block().ops[i] + .input('Filter')[0]) + self._conv_input_var_name.append(self.program.current_block().ops[i] + .input('Input')[0]) + self._conv_output_var_name.append(self.program.current_block().ops[ + i].output('Output')[0]) + self._int8_output_var_op_index_dict[self.program.current_block() + .ops[i].output('Output')[0]] = i + if self.program.current_block().ops[i].desc.input("ResidualData"): + self._residual_input_var_name.append(self.program.current_block( + ).ops[i].desc.input("ResidualData")[0]) + + if self.program.current_block().ops[i + 1].type == "pool2d": + self._pool2d_output_var_name.append(self.program.current_block( + ).ops[i + 1].output('Out')[0]) + + def __expand_quantized_bins(self, quantized_bins, reference_bins): + expanded_quantized_bins = [0] * len(reference_bins) + num_merged_bins = len(reference_bins) / len(quantized_bins) + j_start = 0 + j_end = num_merged_bins + for idx in xrange(len(quantized_bins)): + zero_count = reference_bins[j_start:j_end].count(0) + num_merged_bins = j_end - j_start + if zero_count == num_merged_bins: + avg_bin_ele = 0 + else: + avg_bin_ele = quantized_bins[idx] / ( + num_merged_bins - zero_count + 0.0) + for idx1 in xrange(j_start, j_end): + expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0 + else avg_bin_ele) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == len(quantized_bins) - 1: + j_end = len(reference_bins) + return expanded_quantized_bins + + def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q, + Q_sum): + ''' + Calculate the entropy. + ''' + assert len(reference_distr_P) == len(candidate_distr_Q) + tmp_sum1 = 0 + tmp_sum2 = 0 + for idx in range(len(reference_distr_P)): + p_idx = reference_distr_P[idx] + q_idx = candidate_distr_Q[idx] + if p_idx == 0: + tmp_sum1 += 0 + tmp_sum2 += 0 + else: + if q_idx == 0: + print("Fatal error! idx = " + str(idx) + + " q_idx = 0! p_idx = " + str(p_idx)) + tmp_sum1 += p_idx * (math.log(Q_sum * p_idx)) + tmp_sum2 += p_idx * (math.log(P_sum * q_idx)) + return (tmp_sum1 - tmp_sum2) / P_sum + + # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + def __get_optimal_scaling_factor(self, + activation_blob, + num_quantized_bins=255): + ''' + Use the KL-divergence method to get a more precise scaling factor. + ''' + max_val = np.max(activation_blob) + min_val = np.min(activation_blob) + if min_val >= 0: + hist, hist_edges = np.histogram( + activation_blob, bins=2048, range=(min_val, max_val)) + ending_iter = 2047 + starting_iter = int(ending_iter * 0.7) + else: + th = max(abs(max_val), abs(min_val)) + hist, hist_edges = np.histogram( + activation_blob, bins=2048, range=(-th, th)) + starting_iter = 0 + ending_iter = 2047 + if abs(max_val) > abs(min_val): + while starting_iter < ending_iter: + if hist[starting_iter] == 0: + starting_iter += 1 + continue + else: + break + starting_iter += int((ending_iter - starting_iter) * 0.6) + else: + while ending_iter > 0: + if hist[ending_iter] == 0: + ending_iter -= 1 + continue + else: + break + starting_iter = int(0.6 * ending_iter) + bin_width = hist_edges[1] - hist_edges[0] + P_sum = len(activation_blob) + min_kl_divergence = 0 + min_kl_index = 0 + kl_inited = False + for i in range(starting_iter, ending_iter + 1): + reference_distr_P = hist[0:i].tolist() + outliers_count = sum(hist[i:2048]) + if reference_distr_P[i - 1] == 0: + continue + reference_distr_P[i - 1] += outliers_count + reference_distr_bins = reference_distr_P[:] + candidate_distr_Q = hist[0:i].tolist() + num_merged_bins = i / num_quantized_bins + candidate_distr_Q_quantized = [0] * num_quantized_bins + j_start = 0 + j_end = num_merged_bins + for idx in xrange(num_quantized_bins): + candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[ + j_start:j_end]) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == num_quantized_bins - 1: + j_end = i + candidate_distr_Q = self.__expand_quantized_bins( + candidate_distr_Q_quantized, reference_distr_bins) + Q_sum = sum(candidate_distr_Q) + kl_divergence = self.__safe_entropy(reference_distr_P, P_sum, + candidate_distr_Q, Q_sum) + if not kl_inited: + min_kl_divergence = kl_divergence + min_kl_index = i + kl_inited = True + elif kl_divergence < min_kl_divergence: + min_kl_divergence = kl_divergence + min_kl_index = i + else: + pass + if min_kl_index == 0: + while starting_iter > 0: + if hist[starting_iter] == 0: + starting_iter -= 1 + continue + else: + break + min_kl_index = starting_iter + return (min_kl_index + 0.5) * bin_width + + @staticmethod + def __dot(program, output_name="model.dot"): + ''' + Generate the Graphviz dot file for debugging.
+ ''' + dot_graph = "" + dot_nodes = [] + dot_edges = [] + dot_graph += "digraph pm {\n" + for block in program.blocks: + ops = list(block.ops) + for index, op in enumerate(ops): + op_type = op.type + op_name = op_type + "_" + op.output_arg_names[0].replace( + ".", "_") + "___" + str(index) + for name in op.input_arg_names: + name = name.replace(".", "_") + dot_edge = name + " -> " + op_name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]" + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for name in op.output_arg_names: + name = name.replace(".", "_") + dot_edge = op_name + " -> " + name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + if op_type in Calibrator.supported_int8_op_type: + if op_type == "conv2d" and op.has_attr( + 'force_fp32_output') and op.attr( + "force_fp32_output"): + dot_node = op_name + " [shape=box, style=filled, color=deeppink]" + else: + dot_node = op_name + " [shape=box, style=filled, color=greenyellow]" + elif op_type in ["quantize", "dequantize"]: + dot_node = op_name + " [shape=box, style=filled, color=gold]" + else: + dot_node = op_name + " [shape=box, style=filled, fillcolor=red]" + + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for dot_edge in dot_edges: + dot_graph += dot_edge + "\n" + for dot_node in dot_nodes: + dot_graph += dot_node + "\n" + dot_graph += "}" + + with open(output_name, 'w') as f: + f.write(dot_graph) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index 7d6b0702035d49189c0919f976ea3c0c52663547..f38d9783413a01cd1005a014c0aba5ecf5cc79c2 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from __future__ import print_function +import os +import subprocess from ....framework import Program +from ....framework import Block +from .... import core __all__ = ['Graph', 'ImitationGraph', 'IRGraph'] diff --git a/paddle/py_paddle/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py similarity index 62% rename from paddle/py_paddle/__init__.py rename to python/paddle/fluid/contrib/slim/quantization/__init__.py index 5504d1d50c523315036bfaaf6641c5216269a5e5..6c26475f48855674d97abf5778a631646734fcf8 100644 --- a/paddle/py_paddle/__init__.py +++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from util import DataProviderWrapperConverter -from dataprovider_converter import DataProviderConverter +from __future__ import print_function -__all__ = [ - 'paddle', - 'DataProviderConverter', - 'DataProviderWrapperConverter', # for deprecated usage. - 'loadParameterFile' -] -util.monkeypatches() +from . 
 import quantization_pass +from .quantization_pass import * + +__all__ = quantization_pass.__all__ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..266a106bc507104c0a8db1c882b55ac59e88195e --- /dev/null +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -0,0 +1,318 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from .... import core +from ....framework import IrGraph +from ....framework import Program +from ....framework import Variable +from ....initializer import Constant +from .... import unique_name + +__all__ = ['QuantizationTransformPass'] + + +class QuantizationTransformPass(object): + def __init__(self, + scope=None, + program_exe=None, + weight_bits=8, + activation_bits=8, + activation_quantize_type='abs_max', + weight_quantize_type='abs_max', + window_size=10000): + """ + Convert and rewrite the IrGraph according to weight and + activation quantization type. + Args: + weight_bits (int): quantization bit number for weights, + the bias is not quantized. + activation_bits (int): quantization bit number for activation. + activation_quantize_type (str): quantization type for activation, + now supports 'abs_max' and 'range_abs_max'. If 'abs_max' mode + is used, the quantization scale will be calculated dynamically + in each step during both training and testing. If 'range_abs_max' + is used, a static quantization scale will be calculated during + training and used in inference. + weight_quantize_type (str): quantization type for weights, + only 'abs_max' is supported. 'range_abs_max' is usually not used + for weights, since weights are fixed once the model is well trained. + window_size (int): the window size for 'range_abs_max' quantization. + Examples: + .. code-block:: python + # The original graph will be rewritten. + import paddle.fluid as fluid + from paddle.fluid.contrib.slim.quantization \ + import QuantizationTransformPass + from paddle.fluid.contrib.slim.graph import IrGraph + from paddle.fluid import core + + graph = IrGraph(core.Graph(program.desc), for_test=False) + exe = fluid.Executor(fluid.CPUPlace()) + transform_pass = QuantizationTransformPass(fluid.global_scope(), + exe) + transform_pass.apply(graph) + """ + self._scope = scope + self._program_exe = program_exe + self._weight_bits = weight_bits + self._activation_bits = activation_bits + + quant_type = ['abs_max', 'range_abs_max'] + if activation_quantize_type not in quant_type: + raise ValueError( + "Unknown activation_quantize_type: '%s'. It can only be " + "'abs_max' or 'range_abs_max'." % str(activation_quantize_type)) + if weight_quantize_type not in quant_type: + raise ValueError( + "Unknown weight_quantize_type: '%s'. It can only be " + "'abs_max' or 'range_abs_max'." % str(weight_quantize_type)) + + self._activation_quantize_type = activation_quantize_type + self._weight_quantize_type = weight_quantize_type + self._window_size = window_size + + self._need_initialized = collections.OrderedDict() + self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self._quantizable_grad_ops = [ + '%s_grad' % (op) for op in self._quantizable_ops + ] + self._fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self._fake_dequant_op_types = ['fake_dequantize_max_abs'] + self._is_test = None + self._global_step = None + + def apply(self, graph): + assert isinstance(graph, + IrGraph), 'graph must be the instance of IrGraph.' + self._need_initialized.clear() + self._is_test = graph.is_test() + # record the variables which have been dequantized. + dequantized_vars = collections.OrderedDict() + params = [p.name() for p in graph.all_parameters()] + + def _transform_forward(graph, op): + for var_node in op.inputs: + if var_node.name() in dequantized_vars: + dequant_var_node = dequantized_vars[var_node.name()] + else: + quant_bits = self._weight_bits if var_node.name() in params \ + else self._activation_bits + quant_type = self._weight_quantize_type if var_node.name() \ + in params else self._activation_quantize_type + quant_var_node, scale_var_node = self._insert_quant_op( + graph, var_node, quant_bits, quant_type) + dequant_var_node = self._insert_dequant_op( + graph, quant_var_node, scale_var_node, quant_bits) + dequantized_vars[var_node.name()] = dequant_var_node + graph.update_input_link(var_node, dequant_var_node, op) + + def _transform_backward(graph, op): + no_dequantized_input_vars = True + for var_node in op.inputs: + if var_node.name() in dequantized_vars: + dequant_var_node = dequantized_vars[var_node.name()] + graph.update_input_link(var_node, dequant_var_node, op) + no_dequantized_input_vars = False + if no_dequantized_input_vars: + raise ValueError("There are no dequantized inputs for op %s." % + (op.name())) + + if not self._is_test: + self._create_global_step(graph) + ops = graph.all_ops() + # _transform_forward and _transform_backward need to run in two separate for loops. + # The loop for transforming the forward graph: + for op in ops: + if op.name() in self._quantizable_ops: + _transform_forward(graph, op) + # The loop for renaming the inputs of backward op. + for op in ops: + if op.name() in self._quantizable_grad_ops: + _transform_backward(graph, op) + + if len(self._need_initialized) > 0: + assert self._scope is not None, \ + 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' + assert self._program_exe is not None, \ + 'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
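+ # Build a one-off init program so that every variable created by this
+ # pass (the 'range_abs_max' scales and the step counter) gets its
+ # initial value in the target scope before the rewritten graph runs.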
+ init_program = Program() + for var_desc, initializer in self._need_initialized.iteritems(): + var = Variable(init_program.global_block()) + var._set_desc(var_desc) + initializer(var, init_program.global_block()) + self._program_exe.run(program=init_program, scope=self._scope) + + return graph + + def _create_global_step(self, graph): + if self._weight_quantize_type == 'range_abs_max' or \ + self._activation_quantize_type == 'range_abs_max': + counter_name = '@STEP_COUNTER@' + for node in graph.all_vars(): + if node.name() == counter_name: + self._global_step = node + if self._global_step is None: + global_step_in = graph.create_param_node( + name=counter_name, + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=core.VarDesc.VarType.INT64) + self._need_initialized[global_step_in.var()] = \ + Constant(value=0, force_cpu=True) + global_step_out = graph.create_var_node_from_desc( + global_step_in.var()) + increment_op = graph.create_op_node( + op_type='increment', + attrs={'step': 1.0}, + inputs={'X': global_step_in}, + outputs={'Out': global_step_out}) + graph.link_to(global_step_in, increment_op) + graph.link_to(increment_op, global_step_out) + self._global_step = global_step_out + + def _insert_quant_op(self, graph, var_node, quant_bits, quant_type): + """ + Insert fake_quantize_op in the graph. + """ + if quant_type == 'abs_max': + return self._insert_quant_abs_max_op(graph, var_node, quant_bits) + elif quant_type == 'range_abs_max': + return self._insert_quant_range_abs_max_op(graph, var_node, + quant_bits) + + def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_abs_max op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + scale_var_node = graph.create_var_node( + name=self._quantized_scale_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + quant_op_node = graph.create_op_node( + op_type='fake_quantize_abs_max', + attrs={'bit_length': quant_bits}, + inputs={'X': var_node}, + outputs={'Out': quant_var_node, + 'OutScale': scale_var_node}) + graph.link_to(var_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_var_node) + return quant_var_node, scale_var_node + + def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_range_abs_max on the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + + scale_in_node = graph.create_param_node( + name=self._quantized_scale_name(var_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=var_node.var().dtype()) + self._need_initialized[scale_in_node.var()] = Constant(value=0.001) + + scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) + inputs = {'X': var_node, 'InScale': scale_in_node} + outputs = {'Out': quant_var_node, 'OutScale': scale_out_node} + + if not self._is_test: + # The name of scales_var_node maybe 'scales_0', 'scales_1', etc. 
+ scales_node = graph.create_param_node( + name=unique_name.generate('scales'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[self._window_size], + var_dtype=var_node.var().dtype()) + self._need_initialized[scales_node.var()] = Constant(value=0) + inputs['Iter'] = self._global_step + outputs['OutScales'] = scales_node + attrs = { + 'window_size': self._window_size, + 'bit_length': quant_bits, + 'is_test': self._is_test + } + quant_op_node = graph.create_op_node( + op_type='fake_quantize_range_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + + graph.link_to(var_node, quant_op_node) + graph.link_to(scale_in_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_out_node) + + if not self._is_test: + graph.link_to(self._global_step, quant_op_node) + graph.link_to(quant_op_node, scales_node) + + return quant_var_node, scale_out_node + + def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): + """ + Insert fake_dequantize_op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + dequant_var_node = graph.create_var_node( + name=self._dequantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + max_range = (1 << (quant_bits - 1)) - 1 + dequant_op_node = graph.create_op_node( + op_type='fake_dequantize_max_abs', + attrs={'max_range': float(max_range)}, + inputs={'X': var_node, + 'Scale': scale_var_node}, + outputs={'Out': dequant_var_node}) + graph.link_to(var_node, dequant_op_node) + graph.link_to(scale_var_node, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) + return dequant_var_node + + def _quantized_var_name(self, var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.quantized" % (var_name) + + def _dequantized_var_name(self, var_name): + """ + Return dequantized variable name for the input `var_name`. + """ + return "%s.dequantized" % (var_name) + + def _quantized_scale_name(self, var_name): + """ + Return the scale name of quantized variable for the input `var_name`. + """ + return "%s.scale" % (var_name) diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..1bd4b95d6b90b7f16d507061190f0b463f6c4cc5 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py @@ -0,0 +1,175 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
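The tests that follow exercise the pass end to end. As a quick orientation, here is a minimal sketch (not part of the patch; the variable name "conv2d_0.w_0" is purely hypothetical) of the naming chain produced by the _quantized_var_name, _dequantized_var_name and _quantized_scale_name helpers defined above:

    # Naming-scheme sketch; "conv2d_0.w_0" is only an illustrative name.
    var_name = "conv2d_0.w_0"
    quantized = "%s.quantized" % var_name        # output of the fake_quantize op
    scale = "%s.scale" % var_name                # its OutScale variable
    dequantized = "%s.dequantized" % quantized   # output of fake_dequantize_max_abs

    # check_program() below asserts exactly this suffix on every quantized input.
    assert dequantized == "conv2d_0.w_0.quantized.dequantized"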
+ +import unittest +import random +import numpy as np +import paddle.fluid as fluid +import six +from paddle.fluid.framework import Program +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid import core + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestQuantizationTransformPass(unittest.TestCase): + def setUp(self): + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_grad_op_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, transform_pass, program): + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + # check forward + if op.type in self.quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + quantized_ops.add(arg_name) + + for op in block.ops: + # check backward + if op.type in self.quantizable_grad_op_inputs: + for pname in self.quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_fc_' + quant_type, 
val_marked_nodes) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 79bec8c4ad34d682895250bc29b1fddb3a569bd4..81aee1233d1db756686d1a934b94672dc5c770fe 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(APPLE OR WIN32 OR NOT WITH_MKL) + list(REMOVE_ITEM TEST_OPS test_calibration) +endif() + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py new file mode 100644 index 0000000000000000000000000000000000000000..17e4eb8b831268bed00736db2e9706aece9fdd74 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -0,0 +1,230 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
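Before the calibration test itself, a short numeric sketch of the abs_max fake quantization round trip may help; this is an illustrative numpy reconstruction under the assumption that the ops follow the usual round(x / scale * max_range) convention, not code from the patch. max_range matches _insert_dequant_op above: (1 << (quant_bits - 1)) - 1 = 127 for 8 bits.

    import numpy as np

    quant_bits = 8
    max_range = float((1 << (quant_bits - 1)) - 1)  # 127, as in _insert_dequant_op

    x = np.array([-0.5, 0.1, 0.25], dtype=np.float32)
    scale = np.abs(x).max()          # abs_max: recomputed from the tensor itself

    x_quant = np.round(x / scale * max_range)    # integer values in [-127, 127]
    x_dequant = x_quant * scale / max_range      # back to float32

    # The round trip loses at most half a quantization step:
    assert np.allclose(x, x_dequant, atol=0.5 * scale / max_range + 1e-6)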
+import unittest +import os +import numpy as np +import time +import sys +import random +import paddle +import paddle.fluid as fluid +import argparse +import functools +import contextlib +import paddle.fluid.profiler as profiler +from PIL import Image, ImageEnhance +import math +sys.path.append('..') +import int8_inference.utility as ut + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 + +THREAD = 1 +BUF_SIZE = 102400 + +DATA_DIR = 'data/ILSVRC2012' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + return img, sample[1] + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(full_lines) + + lines = full_lines + + for line in lines: + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + yield img_path, int(label) + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def val(data_dir=DATA_DIR): + file_list = os.path.join(data_dir, 'val_list.txt') + return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) + + +class TestCalibration(unittest.TestCase): + def setUp(self): + # TODO(guomingz): Put the download process in the cmake. 
+ # Download and unzip test data set + imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' + zip_file_name = imagenet_dl_url.split('/')[-1] + cmd = 'rm -rf data {} && mkdir data && wget {} && tar xvf {} -C data'.format( + zip_file_name, imagenet_dl_url, zip_file_name) + os.system(cmd) + # resnet50 fp32 data + resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1] + resnet50_unzip_folder_name = 'resnet50_fp32' + cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format( + resnet50_unzip_folder_name, resnet50_zip_name, + resnet50_unzip_folder_name, resnet50_fp32_model_url, + resnet50_zip_name, resnet50_unzip_folder_name) + os.system(cmd) + + self.iterations = 100 + self.skip_batch_num = 5 + + def run_program(self, model_path, generate_int8=False, algo='direct'): + image_shape = [3, 224, 224] + os.environ['FLAGS_use_mkldnn'] = 'True' + + fluid.memory_optimize(fluid.default_main_program()) + + exe = fluid.Executor(fluid.CPUPlace()) + + [infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + t = fluid.transpiler.InferenceTranspiler() + t.transpile(infer_program, fluid.CPUPlace()) + + val_reader = paddle.batch(val(), batch_size=1) + + if generate_int8: + int8_model = os.path.join(os.getcwd(), "calibration_out") + + if os.path.exists(int8_model): + os.system("rm -rf " + int8_model) + os.system("mkdir " + int8_model) + + print("Start calibration ...") + + calibrator = ut.Calibrator( + program=infer_program, + pretrained_model=model_path, + iterations=100, + debug=False, + algo=algo) + + sampling_data = {} + + calibrator.generate_sampling_program() + test_info = [] + cnt = 0 + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(image_shape) for x in data]).astype("float32") + label = np.array([x[1] for x in data]).astype("int64") + label = label.reshape([-1, 1]) + running_program = calibrator.sampling_program.clone( + ) if generate_int8 else infer_program.clone() + for op in running_program.current_block().ops: + if op.has_attr("use_mkldnn"): + op._set_attr("use_mkldnn", True) + + _, acc1, _ = exe.run( + running_program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + if generate_int8: + for i in calibrator.sampling_program.list_vars(): + if i.name in calibrator.sampling_vars: + np_data = np.array(fluid.global_scope().find_var(i.name) + .get_tensor()) + if i.name not in sampling_data: + sampling_data[i.name] = [] + sampling_data[i.name].append(np_data) + + test_info.append(np.mean(acc1) * len(data)) + cnt += len(data) + + if batch_id != self.iterations - 1: + continue + + break + + if generate_int8: + calibrator.generate_quantized_data(sampling_data) + fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets, + exe, calibrator.sampling_program) + print( + "Calibration is done and the corresponding files were generated at {}". 
+ format(os.path.abspath("calibration_out"))) + else: + return np.sum(test_info) / cnt + + def test_calibration_for_resnet50(self): + fp32_acc1 = self.run_program("resnet50_fp32/model") + self.run_program("resnet50_fp32/model", True) + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7b70d19de5ca309441bdc1404e6e601af3c5b892..a24e1d13003d5c42fa9b5a9346d81c8de4ba45c4 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -88,8 +88,8 @@ class DataToLoDTensorConverter(object): raise ValueError( "Reshape error. What is defined in data layer is {}, but receive {}" .format(self.shape, arr.shape)) - else: - self._check_shape(arr.shape) + #else: + # self._check_shape(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 77239f2e8ee5243611352d275c0d595670e24185..2bdae60db347b3d42fded138a20a505486e48dbc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -23,6 +23,7 @@ import traceback import six import numpy as np +import subprocess from .. import compat as cpt from .proto import framework_pb2 @@ -69,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _imperative_tracer_ = None +_imperative_current_expected_place_ = None def _in_imperative_mode(): @@ -79,6 +81,10 @@ def _imperative_tracer(): return _imperative_tracer_ +def _current_expected_place(): + return _imperative_current_expected_place_ + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -382,8 +388,8 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - tensor = self._ivar.value().get_tensor() - return np.array(tensor) + new_ivar = self._ivar._copy_to(core.CPUPlace(), True) + return np.array(new_ivar.value().get_tensor()) def _backward(self): self._ivar._run_backward() @@ -1310,6 +1316,7 @@ class Block(object): def _trace_op(self, op, stop_gradient=False): if _in_imperative_mode(): _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + _imperative_current_expected_place_, stop_gradient) def _insert_op(self, index, *args, **kwargs): @@ -1512,6 +1519,154 @@ class Block(object): return ret_var +class IrGraph(object): + """ + IrGraph uses core.Graph as the delegation to accomplish the manipulation. + """ + + def __init__(self, graph, for_test=False): + """ + Construct the IrGraph using core.Graph. + Args: + graph(core.Graph): C++ Graph. + for_test(bool): True for the test graph and false for the train graph. + """ + assert isinstance( + graph, core.Graph), 'graph must be the instance of core.Graph.' 
+ self.graph = graph + self._for_test = for_test + + def is_test(self): + return self._for_test + + def all_parameters(self): + param_nodes = set() + for node in self.graph.nodes(): + if node.is_var() and node.var() is not None and node.var( + ).persistable(): + param_nodes.add(node) + return param_nodes + + def all_vars(self): + return {node for node in self.graph.nodes() if node.is_var()} + + def all_ops(self): + return {node for node in self.graph.nodes() if node.is_op()} + + def create_param_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + var_desc.set_persistable(True) + return self.graph.create_var_node(var_desc) + + def create_var_node(self, name, var_type, shape, var_dtype): + var_desc = core.VarDesc(name) + var_desc.set_type(var_type) + var_desc.set_shape(shape) + var_desc.set_dtype(var_dtype) + return self.graph.create_var_node(var_desc) + + def create_var_node_from_desc(self, var_desc): + return self.graph.create_var_node(var_desc) + + def create_op_node(self, op_type, attrs, inputs, outputs): + op_desc = core.OpDesc() + op_desc.set_type(op_type) + for attr, value in attrs.iteritems(): + self._update_desc_attr(op_desc, attr, value) + for input_name, var_nodes in inputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_input(input_name, + [var_node.name() for var_node in var_nodes]) + for output_name, var_nodes in outputs.iteritems(): + if not isinstance(var_nodes, list): + var_nodes = [var_nodes] + op_desc.set_output(output_name, + [var_node.name() for var_node in var_nodes]) + return self.graph.create_op_node(op_desc) + + def create_op_node_from_desc(self, op_desc): + return self.graph.create_op_node(op_desc) + + def update_input_link(self, old_input_node, new_input_node, op_node): + assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \ + op_node in self.graph.nodes(), 'The three arguments must be in the graph nodes.' + old_input_node.outputs_remove(op_node) + op_node.inputs_remove(old_input_node) + new_input_node.outputs_append(op_node) + op_node.inputs_append(new_input_node) + op_node.op()._rename_input(old_input_node.name(), new_input_node.name()) + + def link_to(self, node_in, node_out): + assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \ + 'The two arguments must be in the graph nodes.'
+ node_in.outputs_append(node_out) + node_out.inputs_append(node_in) + + def safe_remove_nodes(self, remove_nodes): + if not isinstance(remove_nodes, set): + remove_nodes = set(remove_nodes) + core.graph_safe_remove_nodes(self.graph, remove_nodes) + + def draw(self, save_path, name, marked_nodes=None): + def _convert_to_pdf(dot_file_path): + pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' + exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ + + ' -o ' + pdf_save_path, shell=True) + if exited_code != 0: + print('The dot command is needed for creating pdf files.') + print('The {} is saved as the dot filetype.'.format( + dot_file_path)) + + remove_ctr_vars = set() + ops_num = 0 + for node in self.graph.nodes(): + if node.is_ctrl_var(): + remove_ctr_vars.add(node) + elif node.is_op(): + ops_num += 1 + print('Total ops num = {}.'.format(ops_num)) + self.safe_remove_nodes(remove_ctr_vars) + if marked_nodes is not None: + if not isinstance(marked_nodes, set): + marked_nodes = set(marked_nodes) + marked_nodes = marked_nodes - remove_ctr_vars + if self.graph.has('__graphviz__marked_node__'): + self.graph.erase('__graphviz__marked_node__') + self.graph.set('__graphviz__marked_node__', marked_nodes) + viz_dot_path = os.path.join(save_path, name) + '.dot' + viz_pass = core.get_pass('graph_viz_pass') + viz_pass.set('graph_viz_path', viz_dot_path) + viz_pass.apply(self.graph) + _convert_to_pdf(viz_dot_path) + + def to_program(self): + convert_pass = core.get_pass('graph_to_program_pass') + convert_pass.set('program', Program().desc) + convert_pass.apply(self.graph) + desc = convert_pass.get_program('program') + program = Program._construct_from_desc(desc) + return program + + def _update_desc_attr(self, desc, name, val): + """ + Update the value of desc's attribute by attribute's name. + """ + if isinstance(val, Block): + desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + desc.set_serialized_attr(name, val.serialize_to_string()) + else: + desc._set_attr(name, val) + + class Program(object): """ Python Program. Beneath it is a ProgramDesc, which is used for @@ -1547,12 +1702,20 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] - # for distribute + # for distribute training + # _is_distributed = True if under distributed training self._is_distributed = False + # _is_chief = True if the trainer is the first one, usually No.0 self._is_chief = False - self._slice_vars_and_attrs = [] + # _parameters_on_pservers records all the parameters distributed on parameter servers. + self._parameters_on_pservers = None + # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"] self._endpoints = [] + # if current role is parameter server, the _ps_endpoint is its "ip:port" + self._ps_endpoint = None + # trainers_endpoints, it is used for distribution. self._trainers_endpoints = [] + # the distributed lookup table names self._distributed_lookup_table = None @property @@ -1936,6 +2099,23 @@ class Program(object): p._sync_with_cpp() return p + @staticmethod + def _construct_from_desc(desc): + """ + Construct a program from program desc. + + Args: + desc(core.ProgramDesc): The program desc for constructing. + + Returns: + Program: A program. 
+ """ + p = Program() + p.desc = desc + p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())] + p._sync_with_cpp() + return p + @property def random_seed(self): """ @@ -2066,8 +2246,9 @@ class Program(object): "Program") self._is_distributed = other._is_distributed self._is_chief = other._is_chief - self._slice_vars_and_attrs = other._slice_vars_and_attrs + self._parameters_on_pservers = other._parameters_on_pservers self._endpoints = other._endpoints + self._ps_endpoint = other._ps_endpoint self._distributed_lookup_table = other._distributed_lookup_table def _copy_data_info_from(self, other): @@ -2327,5 +2508,18 @@ def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace + + +@contextlib.contextmanager +def _imperative_place_guard(place): + global _imperative_current_expected_place_ + tmp_place = _imperative_current_expected_place_ + _imperative_current_expected_place_ = place + + yield + + _imperative_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 5d3ebb25a935cea6ec376e6bc044281dcba37337..ff3984b11f42cf9e6ff49c8654c600c065effe1d 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -25,18 +25,28 @@ def enabled(): @contextlib.contextmanager -def guard(): +def guard(place=None): train = framework.Program() startup = framework.Program() tracer = core.Tracer(train.current_block().desc) + + if place is None: + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): - yield + with framework._imperative_place_guard(place): + yield def to_variable(value, block=None): if isinstance(value, np.ndarray): + assert enabled(), "to_variable could only be called in imperative mode" + if not block: block = framework.default_main_program().current_block() py_var = framework.Variable( @@ -47,9 +57,7 @@ def to_variable(value, block=None): dtype=value.dtype) var = py_var._ivar.value() tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) + tensor.set(value, framework._current_expected_place()) return py_var elif isinstance(value, framework.Variable): return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 0fe680b491f5865f8c07b796f1cb3b542070fb7c..68fffdfa333290da39f921eb6144a5c0c401908b 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -22,8 +22,13 @@ from . 
import layers from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from ..initializer import Normal, Constant - -__all__ = ['Conv2D', 'Pool2D', 'FC', 'EMBEDDING'] +__all__ = [ + 'Conv2D', + 'Pool2D', + 'FC', + 'BatchNorm', + 'EMBEDDING' +] class Conv2D(layers.Layer): @@ -51,7 +56,8 @@ class Conv2D(layers.Layer): param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name) + name=name, + act=act) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -137,6 +143,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act) @@ -212,6 +219,7 @@ class FC(layers.Layer): act=None, name=None): super(FC, self).__init__() + self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype @@ -237,6 +245,16 @@ class FC(layers.Layer): dtype=self._dtype, is_bias=False) + if self._helper.bias_attr: + size = list([self._size]) + self._b = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=size, + dtype=self._dtype, + is_bias=True) + else: + self._b = None + def forward(self, input): tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( @@ -249,31 +267,160 @@ class FC(layers.Layer): "y_num_col_dims": 1 }) - out = self._helper.create_variable_for_type_inference(self._dtype) + pre_bias = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="sum", inputs={"X": [tmp]}, - outputs={"Out": out}, + outputs={"Out": pre_bias}, attrs={"use_mkldnn": False}) - bias_attr = self._helper.bias_attr - if bias_attr: - # add bias - size = list(out.shape[1:]) - if not self._built: - self._b = self._helper.create_parameter( - attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True) - bias_out = self._helper.create_variable_for_type_inference( - dtype=out.dtype) + if self._b: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) self._helper.append_op( type='elementwise_add', - inputs={'X': [out], + inputs={'X': [pre_bias], 'Y': [self._b]}, - outputs={'Out': [bias_out]}, - attrs={'axis': 1}) - out = bias_out - # add activation - return self._helper.append_activation(out) + outputs={'Out': [pre_activation]}, + attrs={'axis': self._num_flatten_dims}) + else: + pre_activation = pre_bias + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_activation) + + +class BatchNorm(layers.Layer): + def __init__(self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype=core.VarDesc.VarType.FP32, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False, + fuse_with_relu=False, + use_global_stats=False): + super(BatchNorm, self).__init__() + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            'batch_norm',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            name=name,
+            act=act)
+
+        if dtype == core.VarDesc.VarType.FP16:
+            self._dtype = core.VarDesc.VarType.FP32
+        else:
+            self._dtype = dtype
+
+        param_shape = [num_channels]
+
+        # create parameter
+        self._scale = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            default_initializer=Constant(1.0))
+
+        # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
+        # # setting stop_gradient=True to reduce computation
+        # if use_global_stats and self._helper.param_attr.learning_rate == 0.:
+        #     self._scale.stop_gradient = True
+
+        self._bias = self._helper.create_parameter(
+            attr=self._helper.bias_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            is_bias=True)
+        # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
+        # # setting stop_gradient=True to reduce computation
+        # if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
+        #     self._bias.stop_gradient = True
+
+        self._mean = self._helper.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=do_model_average_for_mean_and_var),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self._helper.create_parameter(
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=do_model_average_for_mean_and_var),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+        self._in_place = in_place
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._is_test = is_test
+        self._fuse_with_relu = fuse_with_relu
+        self._use_global_stats = use_global_stats
+
+    def _build_once(self, input):
+        pass
+
+    def forward(self, input):
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance_out share the same memory
+        variance_out = self._variance
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        self._helper.append_op(
+            type="batch_norm",
+            inputs={
+                "X": input,
+                "Scale": self._scale,
+                "Bias": self._bias,
+                "Mean": self._mean,
+                "Variance": self._variance
+            },
+            outputs={
+                "Y": batch_norm_out,
+                "MeanOut": mean_out,
+                "VarianceOut": variance_out,
+                "SavedMean": saved_mean,
+                "SavedVariance": saved_variance
+            },
+            attrs={
+                "momentum": self._momentum,
+                "epsilon": self._epsilon,
+                "is_test": self._is_test,
+                "use_mkldnn": False,
+                "fuse_with_relu": self._fuse_with_relu,
+                "use_global_stats": self._use_global_stats
+            })
+
+        # Currently, we don't support inplace in imperative mode
+        return self._helper.append_activation(batch_norm_out)
+
 class EMBEDDING(layers.Layer):
@@ -291,7 +438,7 @@ class EMBEDDING(layers.Layer):
         self._is_distributed = is_distributed
         self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
-            size[0] + padding_idx)
+            size[0] + padding_idx)
         self._param_attr = param_attr
         self._dtype = dtype
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 8a2cd4a9290ab1d2f3e2af2d682994c999d7b931..4f434328e47df4363b304ff55f587018d3157c5e 100644
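# Editor's note: reference arithmetic for the batch_norm op wired up above
# (global-stats/inference path): y = scale * (x - mean) / sqrt(var + eps) + bias,
# applied per channel of an NCHW tensor. A NumPy sketch, not the real kernel.
import numpy as np

def batch_norm_infer(x, scale, bias, mean, var, eps=1e-5):
    shp = (1, x.shape[1], 1, 1)  # broadcast per-channel stats over N, H, W
    return (scale.reshape(shp) * (x - mean.reshape(shp)) /
            np.sqrt(var.reshape(shp) + eps) + bias.reshape(shp))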
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -24,7 +24,8 @@ __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
     'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
     'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer',
-    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer'
+    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer',
+    'NumpyArrayInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -683,6 +684,64 @@ class BilinearInitializer(Initializer):
         return op
 
 
+class NumpyArrayInitializer(Initializer):
+    """Initialize a parameter with a numpy array
+
+    Args:
+        value (numpy.ndarray): numpy array to initialize the variable
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2])))
+    """
+
+    def __init__(self, value):
+        import numpy
+        assert isinstance(value, numpy.ndarray)
+        super(NumpyArrayInitializer, self).__init__()
+        self._value = value
+
+    def __call__(self, var, block):
+        """Add an assign_value op to initialize the variable with the numpy array
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype)
+        if dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in self._value.flat]
+        elif dtype == VarDesc.VarType.INT32:
+            value_name = "int32_values"
+            values = [int(v) for v in self._value.flat]
+        else:
+            raise ValueError("Unsupported dtype %s" % self._value.dtype)
+        if self._value.size > 1024 * 1024 * 5:
+            raise ValueError("The size of input is too big. Please consider "
+                             "saving it to a file and loading it with load_op")
+        op = block._prepend_op(
+            type='assign_value',
+            outputs={'Out': var},
+            attrs={
+                'dtype': dtype,
+                'shape': list(self._value.shape),
+                value_name: values
+            },
+            stop_gradient=True)
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index e74a87fc68db0e126098f7188db4a712dff2612d..6b1d4cc34f3cd40c878740f28618f26d5e89a6bd 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -19,6 +19,7 @@ import errno
 import time
 import shutil
 import six
+from functools import reduce
 
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
@@ -183,8 +184,6 @@ def save_vars(executor,
             # NOTE: don't save the variable which type is RAW
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name == main_program._distributed_lookup_table:
-                continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
                 save_block.append_op(
@@ -206,16 +205,6 @@ def save_vars(executor,
                 outputs={},
                 attrs={'file_path': os.path.join(dirname, filename)})
 
-    # if there is lookup table, the trainer 0 will notify all pserver to save.
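# Editor's note: a usage sketch for the NumpyArrayInitializer added above,
# mirroring its docstring example. Only FP32 and INT32 arrays are accepted,
# and arrays with more than 5 * 1024 * 1024 elements are rejected. The
# layer name and sizes are illustrative.
import numpy as np
import paddle.fluid as fluid

w = np.random.rand(13, 10).astype('float32')  # matches the fc weight shape
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
fc = fluid.layers.fc(input=x, size=10,
                     param_attr=fluid.initializer.NumpyArrayInitializer(w))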
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        attrs = {}
-        attrs['epmap'] = main_program._endpoints
-        attrs['dir'] = lookup_table_filename
-        attrs['lookup_table'] = main_program._distributed_lookup_table
-        save_block.append_op(
-            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-
     executor.run(save_program)
 
@@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None):
         filename=filename)
 
 
+def _save_distributed_persistables(executor, dirname, main_program):
+    """
+    save_persistables for distributed training.
+    the method will do things listed below:
+    1. save part of the persistable variables on the trainer.
+    2. receive "remote prefetch variables" from parameter servers and merge them.
+    3. save the "distributed lookup table" on parameter servers.
+    4. receive "optimizer variables" from parameter servers and merge them.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program): The program whose parameters will be
+                               saved. The main_program must be the trainer_program
+                               obtained after transpiling.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            train_program = t.get_trainer_program()
+            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
+    """
+
+    def __save_remote_params(executor, dirname, remote_params_map):
+        """
+        receive params from pservers through rpc.
+        if the params are sliced, concat them into one, then save it.
+        """
+        if not remote_params_map:
+            return
+
+        prog = Program()
+        block = prog.global_block()
+
+        # recv optimizer vars from pservers
+        for name, remote_params in remote_params_map.items():
+            origin_var = None
+            is_slice = False
+            slice_vars = [0] * len(remote_params)
+            slice_var_names = [""] * len(remote_params)
+            endpoints = [""] * len(remote_params)
+
+            for idx, optimizer in enumerate(remote_params):
+                origin = optimizer.origin
+                slice = optimizer.slice
+                is_slice = optimizer.is_slice
+                block_id = optimizer.block_id
+                endpoint = optimizer.endpoint
+
+                if idx == 0:
+                    origin_var = block.create_var(
+                        name=origin.name,
+                        type=origin.type,
+                        shape=origin.shape,
+                        dtype=origin.dtype,
+                        persistable=True)
+
+                slice_var = block.create_var(
+                    name="{}.slice.{}".format(slice.name, idx),
+                    type=slice.type,
+                    shape=slice.shape,
+                    dtype=slice.dtype,
+                    persistable=True)
+
+                index = block_id if is_slice else idx
+                slice_vars[index] = slice_var
+                slice_var_names[index] = slice.name
+                endpoints[index] = endpoint
+
+            if is_slice:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": slice_vars},
+                    attrs={
+                        "epmap": endpoints,
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+                block.append_op(
+                    type='concat',
+                    inputs={'X': slice_vars},
+                    outputs={'Out': origin_var},
+                    attrs={})
+            else:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": [origin_var]},
+                    attrs={
+                        "epmap": endpoints[:1],
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+            block.append_op(
+                type='save',
+                inputs={'X': [origin_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(dirname, origin_var.name)})
+            block.append_op(type='delete_var', inputs={'X': slice_vars})
+        executor.run(prog)
+
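# Editor's note: a NumPy sketch of what the recv + concat ops above rebuild:
# slices fetched from the pservers are placed by block_id and concatenated
# along axis 0 to recover the origin variable. Shapes are illustrative.
import numpy as np

def merge_slices(slices, block_ids):
    ordered = [s for _, s in sorted(zip(block_ids, slices),
                                    key=lambda t: t[0])]
    return np.concatenate(ordered, axis=0)

full = merge_slices([np.zeros((4, 8)), np.ones((6, 8))], block_ids=[1, 0])
assert full.shape == (10, 8)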
+    def __save_distributed_lookup_tables(executor, dirname,
+                                         distributed_lookup_table, endpoints):
+        """
+        because the distributed lookup table may be too huge to merge and save
+        in one place, it is saved on each parameter server independently.
+
+        the save directory is dirname/"__lookup_table__".
+
+        """
+        prog = Program()
+        block = prog.global_block()
+
+        # if there is a lookup table, trainer 0 will notify all pservers to save.
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        attrs = {}
+        attrs['epmap'] = endpoints
+        attrs['dir'] = lookup_table_filename
+        attrs['lookup_table'] = distributed_lookup_table
+        block.append_op(
+            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+        executor.run(prog)
+
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_save_distributed_persistables' is only designed for distributed training."
+        )
+
+    remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes(
+        ["Optimizer", "RemotePrefetch"], groupby=True)
+
+    exclude_var_names = []
+    if remote_params_map:
+        exclude_var_names.extend(remote_params_map.keys())
+
+    if main_program._distributed_lookup_table:
+        if isinstance(main_program._distributed_lookup_table, list):
+            exclude_var_names.extend(main_program._distributed_lookup_table)
+        else:
+            exclude_var_names.append(main_program._distributed_lookup_table)
+
+    local_vars = list(
+        filter(__exclude_vars(exclude_var_names), main_program.list_vars()))
+    save_vars(
+        executor, main_program=main_program, dirname=dirname, vars=local_vars)
+
+    if main_program._is_chief:
+        if remote_params_map:
+            __save_remote_params(executor, dirname, remote_params_map)
+        if main_program._distributed_lookup_table:
+            __save_distributed_lookup_tables(
+                executor, dirname, main_program._distributed_lookup_table,
+                main_program._endpoints)
+
+
 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
     This function filters out all variables with `persistable==True` from the
@@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
-    save_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=is_persistable,
-        filename=filename)
+
+    if main_program and main_program._is_distributed:
+        _save_distributed_persistables(
+            executor, dirname=dirname, main_program=main_program)
+
+    else:
+        save_vars(
+            executor,
+            dirname=dirname,
+            main_program=main_program,
+            vars=None,
+            predicate=is_persistable,
+            filename=filename)
 
 
 def load_vars(executor,
@@ -402,17 +577,11 @@ def load_vars(executor,
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
-        load_slice_vars = []
-        for each_var in main_program._slice_vars_and_attrs:
-            load_slice_vars.append(each_var[2].name)
-
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name in load_slice_vars:
-
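# Editor's note: with the dispatch above, the one public entry point covers
# both modes. A trainer-side sketch assuming a transpiled program, with the
# transpile arguments elided as in the docstring above.
import paddle.fluid as fluid
from paddle.fluid.transpiler import DistributeTranspiler

exe = fluid.Executor(fluid.CPUPlace())
t = DistributeTranspiler()
# t.transpile(...)  # trainer_id, program, pservers, trainers, ...
train_program = t.get_trainer_program()
fluid.io.save_persistables(executor=exe, dirname="./my_paddle_model",
                           main_program=train_program)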
continue new_var = _clone_var_in_block_(load_block, each_var) if filename is None: load_block.append_op( @@ -435,10 +604,6 @@ def load_vars(executor, attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) - # load slice vars on pserver, if have it. - _load_slice_up_vars(executor, dirname, - main_program._slice_vars_and_attrs) - def load_params(executor, dirname, main_program=None, filename=None): """ @@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None): fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ - load_vars( - executor, - dirname=dirname, - main_program=main_program, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _load_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + else: + load_vars( + executor, + dirname=dirname, + main_program=main_program, + predicate=is_persistable, + filename=filename) + + +def _load_distributed_persistables(executor, dirname, main_program=None): + """ + customized load_persistables for distributed training. + it should be used on parameter server, + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The load directory path. + main_program(Program): The program whose parameters will be + loaded. the main_program must be the pserver_program + get after transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + pserver_prog = t.get_pserver_program(...) + _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog) + """ + + def __is_distributed_part_var(varname): + trainer_idx = varname.find(".trainer_") + block_idx = varname.find(".block") + return trainer_idx or block_idx + + def __load_persistable_vars(executor, dirname, need_load_vars): + load_prog = Program() + load_block = load_prog.global_block() + need_delete_vars = [] + + for param in need_load_vars: + origin_var = param.origin + slice_var = param.slice + is_slice = param.is_slice + offset = param.offset + + if is_slice: + origin = load_block.create_var( + name="{}.load".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + slice = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) + start = int(offset / dim1_flatten) + end = int(offset / dim1_flatten + slice.shape[0]) + + load_block.append_op( + type="slice", + inputs={'Input': origin}, + outputs={'Out': slice}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + need_delete_vars.append(origin) + else: + origin = load_block.create_var( + name="{}".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + load_block.append_op( + type='delete_var', + inputs={'X': need_delete_vars}, ) + + executor.run(load_prog) + + if not 
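# Editor's note: reference arithmetic for the slice bounds computed in
# __load_persistable_vars above. Offsets are stored in flattened elements,
# so they are converted to row indices along axis 0 before the slice op.
from functools import reduce

def slice_bounds(offset, slice_shape):
    dim1_flatten = reduce(lambda x, y: x * y, slice_shape[1:], 1)
    start = offset // dim1_flatten
    end = start + slice_shape[0]
    return start, end

assert slice_bounds(offset=12, slice_shape=(2, 4)) == (3, 5)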
isinstance(main_program, Program): + raise ValueError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_load_distributed_persistables' just be designed for distributed training." + ) + + if not main_program._ps_endpoint: + raise ValueError( + "'_load_distributed_persistables' need current_endpoint set in DistributeTranspiler.transpile" + ) + + need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep( + main_program._ps_endpoint) + __load_persistable_vars(executor, dirname, need_load_vars) def prepend_feed_ops(inference_program, @@ -795,52 +1082,6 @@ def load_inference_model(dirname, return [program, feed_target_names, fetch_targets] -def _save_lookup_tables_by_notify(executor, dirname, lookup_table, - pserver_endpoints): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - pserver_endpoints=ps_endpoints) - """ - - pserver_notify_program = Program() - pserver_notify_block = pserver_notify_program.global_block() - - attrs = {} - attrs['epmap'] = pserver_endpoints - attrs['dir'] = dirname - attrs['lookup_table'] = lookup_table - - pserver_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(pserver_notify_program) - - def _endpoints_replacement(program, endpoints): ENDPOINT_MAP = "epmap" for op in program.global_block().ops: @@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): - if not slice_vars_and_attrs: - return - - load_prog = Program() - load_block = load_prog.global_block() - need_delete_vars = [] - - for var_tuple in slice_vars_and_attrs: - orig_var = var_tuple[0] - start = var_tuple[1] - slice_var = var_tuple[2] - end = start + slice_var.shape[0] - - orig_var_name = orig_var.name - orig_var.name = "{}.origin".format(orig_var_name) - - clone_orig_var = load_block.create_var( - name=orig_var.name, - type=orig_var.type, - shape=orig_var.shape, - dtype=orig_var.dtype, - persistable=True) - - clone_slice_var = load_block.create_var( - name=slice_var.name, - type=slice_var.type, - shape=slice_var.shape, - dtype=slice_var.dtype, - persistable=True) - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [clone_orig_var]}, - attrs={'file_path': os.path.join(dirname, orig_var_name)}) - load_block.append_op( - type="slice", - inputs={'Input': clone_orig_var}, - outputs={'Out': clone_slice_var}, - attrs={'axes': [0], - 'starts': [start], - 'ends': [end]}) 
-        need_delete_vars.append(clone_orig_var)
-
-    load_block.append_op(
-        type='delete_var',
-        inputs={'X': need_delete_vars}, )
-    executor.run(load_prog)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index ea9953f5814207c569e799c2ffa11758e31699aa..972c51938f2b2282f8de4b090f9af3bc66f89155 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -435,7 +435,10 @@ class LayerHelper(object):
         act_type = act.pop('type')
         tmp = input_var
         # NOTE(dzhwinter): some activation support inplace compution.
-        if not core.IsInplace(act_type):
+        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
+        if not imperative_base.enabled() and core.IsInplace(act_type):
+            tmp = input_var
+        else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 8aed97dc59b100d4e37832e0a148d73662742ba0..cddc302d52e0a5aea802fd7e1464f1e220c8f769 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -44,6 +44,7 @@ __all__ = [
     'roi_perspective_transform',
     'generate_proposal_labels',
     'generate_proposals',
+    'generate_mask_labels',
     'iou_similarity',
     'box_coder',
     'polygon_box_transform',
@@ -1659,7 +1660,7 @@ def generate_proposal_labels(rpn_rois,
                              class_nums=None,
                              use_random=True):
     """
-    ** Generate proposal labels Faster-RCNN **
+    ** Generate Proposal Labels of Faster-RCNN **
     This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
     to sample foreground boxes and background boxes, and compute loss target.
@@ -1740,6 +1741,140 @@ def generate_proposal_labels(rpn_rois,
     return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights
 
 
+def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
+                         labels_int32, num_classes, resolution):
+    """
+    ** Generate Mask Labels for Mask-RCNN **
+
+    Given the RoIs and the corresponding labels, this operator samples
+    foreground RoIs. The mask branch also has a :math:`K \\times M^{2}`
+    dimensional output target for each foreground RoI, which encodes K
+    binary masks of resolution M x M, one for each of the K classes. These
+    mask targets are used to compute the loss of the mask branch.
+
+    Please note the data format of the ground-truth segmentation, assuming
+    the segmentations are as follows. The first instance has two gt objects.
+    The second instance has one gt object, and this object has two gt segmentations.
+
+        .. code-block:: python
+
+            #[
+            #  [[[229.14, 370.9, 229.14, 370.9, ...]],
+            #   [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance
+            #  [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance
+            #]
+
+            batch_masks = []
+            for segms in batch_segms:
+                gt_masks = []
+                for segm in segms:
+                    gt_segm = []
+                    for polys in segm:
+                        gt_segm.append(np.array(polys).reshape(-1, 2))
+                    gt_masks.append(gt_segm)
+                batch_masks.append(gt_masks)
+
+            place = fluid.CPUPlace()
+            feeder = fluid.DataFeeder(place=place, feed_list=feeds)
+            feeder.feed(batch_masks)
+
+    Args:
+        im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size,
+            each element is [height, width, scale] of image. Image scale is
+            target_size / original_size.
+        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total
+            number of ground-truth, each element is a class label.
+        is_crowd(Variable): A 2-D LoDTensor with the same shape as gt_classes,
+            each element is a flag indicating whether a ground-truth is crowd.
+        gt_segms(Variable): This input is a 2D LoDTensor with shape [S, 2],
+            its LoD level is 3. Usually users do not need to understand LoD;
+            they should return the correct data format in the reader.
+
+            LoD[0] represents the number of gt objects in
+            each instance. LoD[1] represents the segmentation count of each
+            object. LoD[2] represents the number of polygons in each segmentation.
+            S is the total number of polygon coordinate points. Each element is
+            an (x, y) coordinate point.
+        rois(Variable): A 2-D LoDTensor with shape [R, 4]. R is the total
+            number of RoIs, each element is a bounding box with
+            (xmin, ymin, xmax, ymax) format in the range of the original image.
+        labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type
+            of int32. R is the same as in `rois`. Each element represents
+            a class label of a RoI.
+        num_classes(int): The number of classes.
+        resolution(int): Resolution of mask predictions.
+
+    Returns:
+        mask_rois (Variable): A 2D LoDTensor with shape [P, 4]. P is the total
+            number of sampled RoIs. Each element is a bounding box with
+            [xmin, ymin, xmax, ymax] format in the range of the original image size.
+        roi_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1],
+            each element represents the output mask RoI index with regard
+            to the input RoIs.
+        mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M],
+            K is the number of classes and M is the resolution of mask predictions.
+            Each element represents the binary mask targets.
+
+    Examples:
+        .. code-block:: python
+
+            im_info = fluid.layers.data(name="im_info", shape=[3],
+                dtype="float32")
+            gt_classes = fluid.layers.data(name="gt_classes", shape=[1],
+                dtype="float32", lod_level=1)
+            is_crowd = fluid.layers.data(name="is_crowd", shape=[1],
+                dtype="float32", lod_level=1)
+            gt_masks = fluid.layers.data(name="gt_masks", shape=[2],
+                dtype="float32", lod_level=3)
+            # rois, labels_int32 can be the output of
+            # fluid.layers.generate_proposal_labels.
+            mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels(
+                im_info=im_info,
+                gt_classes=gt_classes,
+                is_crowd=is_crowd,
+                gt_segms=gt_masks,
+                rois=rois,
+                labels_int32=labels_int32,
+                num_classes=81,
+                resolution=14)
+    """
+
+    helper = LayerHelper('generate_mask_labels', **locals())
+
+    mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype)
+    roi_has_mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+    mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+
+    helper.append_op(
+        type="generate_mask_labels",
+        inputs={
+            'ImInfo': im_info,
+            'GtClasses': gt_classes,
+            'IsCrowd': is_crowd,
+            'GtSegms': gt_segms,
+            'Rois': rois,
+            'LabelsInt32': labels_int32
+        },
+        outputs={
+            'MaskRois': mask_rois,
+            'RoiHasMaskInt32': roi_has_mask_int32,
+            'MaskInt32': mask_int32
+        },
+        attrs={'num_classes': num_classes,
+               'resolution': resolution})
+
+    mask_rois.stop_gradient = True
+    roi_has_mask_int32.stop_gradient = True
+    mask_int32.stop_gradient = True
+
+    return mask_rois, roi_has_mask_int32, mask_int32
+
+
 def generate_proposals(scores,
                        bbox_deltas,
                        im_info,
@@ -1754,33 +1889,48 @@ def generate_proposals(scores,
     """
     **Generate proposal Faster-RCNN**
 
-    This operation proposes RoIs according to each box with their probability to be a foreground object and
-    the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
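# Editor's note: a layout sketch for the mask_int32 output described above.
# Each sampled foreground RoI row holds K binary M x M masks, flattened
# class-major to length K * M * M. P, K, M below are illustrative.
import numpy as np

P, K, M = 4, 81, 14
mask_int32 = np.zeros((P, K * M * M), dtype=np.int32)

def write_mask(row, cls, mask):
    # place the M x M binary mask for class `cls` into RoI row `row`
    mask_int32[row, cls * M * M:(cls + 1) * M * M] = mask.reshape(-1)

write_mask(0, 3, np.ones((M, M), dtype=np.int32))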
+    This operation proposes RoIs according to each box's probability of
+    being a foreground object, where the boxes are computed from anchors.
+    bbox_deltas and objectness scores are the output of the RPN. Final proposals
     could be used to train detection net.
 
     For generating proposals, this operation performs following steps:
 
-    1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
+    1. Transposes and resizes scores and bbox_deltas in size of
+       (H*W*A, 1) and (H*W*A, 4)
     2. Calculate box locations as proposals candidates.
     3. Clip boxes to image
     4. Remove predicted boxes with small area.
     5. Apply NMS to get final proposals as output.
 
     Args:
-        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
-            N is batch size, A is number of anchors, H and W are height and width of the feature map.
-        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location.
-        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
+        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents
+            the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and
+            width of the feature map.
+        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W]
+            represents the difference between the predicted box location and
+            the anchor location.
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin
+            image information for N batch. Info contains height, width and scale
             between origin image size and the size of feature map.
-        anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
-            num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-        variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
-        pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
-        post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
+        anchors(Variable): A 4-D Tensor represents the anchors with a layout
+            of [H, W, A, 4]. H and W are height and width of the feature map,
+            num_anchors is the box count of each position. Each anchor is
+            in (xmin, ymin, xmax, ymax) format and unnormalized.
+        variances(Variable): The expanded variances of anchors with a layout of
+            [H, W, num_priors, 4]. Each variance is in
+            (xcenter, ycenter, w, h) format.
+        pre_nms_top_n(float): Number of total bboxes to be kept per
+            image before NMS. 6000 by default.
+        post_nms_top_n(float): Number of total bboxes to be kept per
+            image after NMS. 1000 by default.
         nms_thresh(float): Threshold in NMS, 0.5 by default.
-        min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
-        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
-
+        min_size(float): Remove predicted boxes with either height or
+            width < min_size. 0.1 by default.
+        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5,
+            adaptive_threshold = adaptive_threshold * eta in each iteration.
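# Editor's note: a NumPy sketch of steps 3-4 listed above (clip boxes to
# the image, drop boxes with a tiny side); the real op also transposes the
# inputs, decodes anchors, and applies NMS.
import numpy as np

def clip_and_filter(boxes, im_h, im_w, min_size=0.1):
    boxes = boxes.copy()
    boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, im_w - 1)  # x coordinates
    boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, im_h - 1)  # y coordinates
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    return boxes[(ws >= min_size) & (hs >= min_size)]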
""" helper = LayerHelper('generate_proposals', **locals()) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index dde05189722fef77e03a1c2d8f3cbae44a3e8245..617704a53138bd081a2ebe318de0c89e8db4aa96 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay): The decayed learning rate Examples: .. code-block:: python - + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 503c91c27bafa764bec5cc3d22153e91ae3787d9..339290384398df6d85d2f914f311af2cd0d33aea 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,7 @@ import six import os import inspect from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant +from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -2874,7 +2874,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) # setting stop_gradient=True to reduce computation if use_global_stats and helper.bias_attr.learning_rate == 0.: - scale.stop_gradient = True + bias.stop_gradient = True mean = helper.create_parameter( attr=ParamAttr( @@ -3875,6 +3875,7 @@ def beam_search(pre_ids, beam_size, end_id, level=0, + is_accumulated=True, name=None): """ Beam search is a classical algorithm for selecting candidate words in a @@ -3887,14 +3888,17 @@ def beam_search(pre_ids, selects the top-K candidate word ids of current step from :attr:`ids` according to their :attr:`scores` for all source sentences, where K is :attr:`beam_size` and :attr:`ids, scores` are predicted results from the - computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are - the output of beam_search at previous step, they are needed for special use - to handle ended candidate translations. - - Note that the :attr:`scores` passed in should be accumulated scores, and - length penalty should be done with extra operators before calculating the - accumulated scores if needed, also suggest finding top-K before it and - using the top-K candidates following. + computation cell. If :attr:`ids` is not set, it will be calculated out + according to :attr:`scores`. Additionally, :attr:`pre_ids` and + :attr:`pre_scores` are the output of beam_search at previous step, they + are needed for special use to handle ended candidate translations. + + Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores` + passed in should be accumulated scores. Else, the :attr:`scores` are + considered as the straightforward scores and will be transformed to the + log field and accumulated the :attr:`pre_scores` in this operator. + Length penalty should be done with extra operators before calculating the + accumulated scores if needed. Please see the following demo for a fully beam search usage example: @@ -3924,6 +3928,8 @@ def beam_search(pre_ids, describes how these candidates belong to the prefix. The paths linking prefixes and selected candidates are organized and reserved in lod. + is_accumulated(bool, default True): Whether the input :attr:`score` is + accumulated scores. 
name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3952,8 +3958,12 @@ def beam_search(pre_ids, end_id=end_id) """ helper = LayerHelper('beam_search', **locals()) - score_type = scores.dtype - id_type = ids.dtype + score_type = pre_scores.dtype + id_type = pre_ids.dtype + + inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores} + if ids is not None: + inputs["ids"] = ids selected_scores = helper.create_variable_for_type_inference( dtype=score_type) @@ -3961,12 +3971,7 @@ def beam_search(pre_ids, helper.append_op( type='beam_search', - inputs={ - 'pre_ids': pre_ids, - 'pre_scores': pre_scores, - 'ids': ids, - 'scores': scores, - }, + inputs=inputs, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, @@ -3976,6 +3981,7 @@ def beam_search(pre_ids, 'level': level, 'beam_size': beam_size, 'end_id': end_id, + 'is_accumulated': is_accumulated, }) return selected_ids, selected_scores @@ -5146,9 +5152,9 @@ def nce(input, littles = [] for i in range(custom_dist_len): normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5170,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left @@ -5181,14 +5187,21 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign(input=np.array(custom_dist).astype('float32')) - custom_alias = assign(input=np.array(alias_).astype('int32')) - custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32')) - - inputs['CustomDistProbs'] = probs - inputs['CustomDistAlias'] = custom_alias - inputs['CustomDistAliasProbs'] = custom_alias_probs + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) sampler = 2 else: raise Exception("Unsupported sampler type.") @@ -5849,7 +5862,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + stop_gradient=True) counter.stop_gradient = True return counter @@ -8927,7 +8941,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): def sigmoid_cross_entropy_with_logits(x, label, ignore_index=kIgnoreIndex, - name=None): + name=None, + normalize=False): """ ${comment} @@ -8936,9 +8951,25 @@ def sigmoid_cross_entropy_with_logits(x, label(${label_type}): ${label_comment} ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. + normalize(bool): If true, divide the output by the number of + targets != ignore_index. 
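# Editor's note: a sampling sketch for the alias tables built above
# (Walker's alias method); with the strict 1.0 thresholds the tables are
# exact. alias_[i] == -1 marks a bucket that has no alias.
import numpy as np

def alias_sample(alias_probs_, alias_, rng=np.random):
    i = rng.randint(len(alias_probs_))
    if rng.rand() < alias_probs_[i] or alias_[i] < 0:
        return i          # keep the sampled bucket
    return alias_[i]      # fall through to its alias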
Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', shape=[10], dtype='float32') + label = fluid.layers.data( + name='data', shape=[10], dtype='float32') + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=input, + label=label, + ignore_index=-1, + normalize=True) # or False + # loss = fluid.layers.reduce_sum(loss) # summation of loss """ helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) @@ -8953,7 +8984,8 @@ def sigmoid_cross_entropy_with_logits(x, type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={"ignore_index": ignore_index}, + attrs={"ignore_index": ignore_index, + 'normalize': normalize}, outputs={"Out": out}) return out @@ -9450,7 +9482,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ce9f508c9f10981d62b7f8417080f0f3b8d9b8a7..2153ca254f0e286a77160a2d53473e1bc76109d5 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'dtype': out.dtype, 'value': float(value), 'force_cpu': force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) out.stop_gradient = True return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index b72b900d3b26cb705b76623e9bacc16c76a7c79f..14f4276e2f4fc4a24d701ef05c94b88c4f0336da 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -301,10 +301,10 @@ class Optimizer(object): no_grad_set (set|None): set of Variables should be ignored. callbacks (list|None): list of callables to run when appending backward operator for one parameter. - + Return: list: list of (param, grad) pair, grad is the output of backward. - + Examples: See examples in `apply_gradients`. """ @@ -322,10 +322,10 @@ class Optimizer(object): Args: params_grads (list): list of (param, grad) pair to do optimization. - + Returns: list: A list of operators appended to the current program. - + Examples: .. code-block:: python @@ -364,7 +364,7 @@ class Optimizer(object): This method combines interface `backward()` and `apply_gradients()` into one. - + Args: loss (Variable): loss variable to run optimizations. 
startup_program (Program): startup_program for initializing parameters @@ -381,18 +381,21 @@ class Optimizer(object): optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: - params_grads = parameter_list + parameters = parameter_list else: parameters = program.global_block().all_parameters() - params_grads = [] - for param in parameters: - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + + params_grads = [] + for param in parameters: + if param.stop_gradient: + continue + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(program, startup_program): optimize_ops = self._create_optimization_pass(params_grads) else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584c399b790580757dea746d7b4e4ac80..a07ff6ac69ca20c8c68659a67606076ce8cdf027 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. persistable_vars diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index d99eaa0634f93dcd16dd80ae172f11e8090a2623..2d9ed9f9c69a15af454bfec5918fd8bab27d6e4c 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -203,7 +203,7 @@ class TestGenerateProposalLabels(unittest.TestCase): lod_level=1, append_batch_size=False) class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + outs = fluid.layers.generate_proposal_labels( rpn_rois=rpn_rois, gt_classes=gt_classes, is_crowd=is_crowd, @@ -216,6 +216,11 @@ class TestGenerateProposalLabels(unittest.TestCase): bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], class_nums=class_nums) + rois = outs[0] + labels_int32 = outs[1] + bbox_targets = outs[2] + bbox_inside_weights = outs[3] + bbox_outside_weights = outs[4] assert rois.shape[1] == 4 assert rois.shape[0] == labels_int32.shape[0] assert rois.shape[0] == bbox_targets.shape[0] @@ -226,6 +231,62 @@ class TestGenerateProposalLabels(unittest.TestCase): assert bbox_outside_weights.shape[1] == 4 * class_nums +class TestGenerateMaskLabels(unittest.TestCase): + def test_generate_mask_labels(self): + program = Program() + with program_guard(program): + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_segms = layers.data( + name='gt_segms', + shape=[20, 2], + dtype='float32', + lod_level=3, + append_batch_size=False) + rois = layers.data( + name='rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + 
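# Editor's note: a mirror sketch of the imperative branch of minimize()
# above, which now skips frozen parameters; make_grad_var stands in for the
# paddle-internal _ivar plumbing and is a hypothetical helper.
def collect_params_grads(parameters, make_grad_var):
    params_grads = []
    for param in parameters:
        if param.stop_gradient:
            continue  # frozen parameter: no gradient variable is paired
        params_grads.append((param, make_grad_var(param)))
    return params_grads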
append_batch_size=False) + labels_int32 = layers.data( + name='labels_int32', + shape=[4, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + num_classes = 5 + resolution = 14 + outs = fluid.layers.generate_mask_labels( + im_info=im_info, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_segms=gt_segms, + rois=rois, + labels_int32=labels_int32, + num_classes=num_classes, + resolution=resolution) + mask_rois, roi_has_mask_int32, mask_int32 = outs + assert mask_rois.shape[1] == 4 + assert mask_int32.shape[1] == num_classes * resolution * resolution + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] @@ -313,7 +374,7 @@ class TestRpnTargetAssign(unittest.TestCase): name='gt_boxes', shape=[4], lod_level=1, dtype='float32') is_crowd = layers.data( name='is_crowd', - shape=[10], + shape=[1, 10], dtype='int32', lod_level=1, append_batch_size=False) @@ -323,7 +384,7 @@ class TestRpnTargetAssign(unittest.TestCase): dtype='float32', lod_level=1, append_batch_size=False) - pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign( + outs = layers.rpn_target_assign( bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -337,6 +398,11 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_positive_overlap=0.7, rpn_negative_overlap=0.3, use_random=False) + pred_scores = outs[0] + pred_loc = outs[1] + tgt_lbl = outs[2] + tgt_bbox = outs[3] + bbox_inside_weight = outs[4] self.assertIsNotNone(pred_scores) self.assertIsNotNone(pred_loc) @@ -351,41 +417,43 @@ class TestRpnTargetAssign(unittest.TestCase): class TestGenerateProposals(unittest.TestCase): def test_generate_proposals(self): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[1, 3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[1, num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + program = Program() + with program_guard(program): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + 
im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) class TestYoloDetection(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 808e1e6aa80744db1289094d7c1bad00002a4c3e..c23dfa01e76c21d0d162f2fed986e2eaf3a70a6d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) +list(REMOVE_ITEM TEST_OPS test_imperative_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index faec5350424668fca6416e91c3e58174bd4ec877..f0f13a9d49c5b84521aa3e00bdcabe0c494853a7 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, - args.trainers, args.sync_mode) + args.trainers, args.sync_mode, False, + args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) @@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): exe.run(startup_prog) if need_load and model_dir: - self._load_persistable_vars(exe, model_dir, startup_prog) + fluid.io.load_persistables(exe, model_dir, pserver_prog) + exe.run(pserver_prog) def run_trainer(self, args): @@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = os.getenv("MODEL_DIR", "") - - if need_save: - for _ in six.moves.xrange(RUN_STEP): - loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(get_data())) - if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) - - var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - if six.PY2: - print(pickle.dumps(np.ravel(var).tolist())) + save_mode = os.getenv("SAVE_MODE", "") + + if save_mode == "LOCAL": + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = 
np.array(fluid.global_scope().find_var('__fc_b__').get_tensor( + )) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + + elif save_mode == "DIST": + skip_steps = int(os.getenv("SKIP_STEPS")) + loss = None + if need_save: + for idx in six.moves.xrange(8): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: + io.save_persistables(startup_exe, model_dir, + trainer_prog) + else: + for idx in six.moves.xrange(8): + data = get_data() + if idx <= skip_steps: + continue + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + if six.PY2: + print(pickle.dumps(loss.tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: - sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + raise Exception("save_mode must be LOCAL or DIST") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index fac5e037a46715d146e354825f09ee8ccc4f3d70..09afae6114e2b6cc8bce9b2be3b221ba9825db8c 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(): - # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) +def get_optimizer(op="sgd"): + if op.upper() == "sgd".upper(): + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + elif op.upper() == "adam".upper(): + optimizer = fluid.optimizer.Adam(learning_rate=base_lr) + else: + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) return optimizer @@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = get_optimizer() + opt = os.getenv('OPTIMIZER', 'sgd') + opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 3fcdc57906c214bdc8179c55b576e2e9e8d80973..0968ace62b6a4e258f7763dbf6fbeda07feb4cd5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -43,7 +43,8 @@ class TestDistRunnerBase(object): pserver_endpoints, trainers, sync_mode, - dc_asgd=False): + dc_asgd=False, + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. 
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd @@ -53,7 +54,8 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode) + sync_mode=sync_mode, + current_endpoint=current_endpoint) return t def run_pserver(self, args): @@ -122,8 +124,8 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") - mypass.set_int("num_repeats", args.batch_merge_repeat) + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") + mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": build_stra.num_trainers = len(args.endpoints.split(",")) diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 4588ca7c17ba5db893f080813d299feaa47626a7..e795bc410ee45a18cc0c7c914636f5b03309fad1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase): delta=1e-3, check_error_log=False, need_envs={}): - required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'LOCAL', + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + save_env = {} + save_env["SAVE_MODE"] = "DIST" + save_env["SAVE"] = "1" + save_env["MODEL_DIR"] = model_dir + save_env.update(required_envs) + + tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, + check_error_log) + + load_env = {} + load_env["LOAD"] = "1" + load_env["MODEL_DIR"] = model_dir + load_env.update(required_envs) + tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, + check_error_log) + + shutil.rmtree(model_dir) + + train0_1_np = np.array(tr0_var_1) + train1_1_np = np.array(tr1_var_1) + train0_2_np = np.array(tr0_var_2) + train1_2_np = np.array(tr1_var_2) + + self.assertAlmostEqual( + train0_1_np.all(), train0_2_np.all(), delta=delta) + self.assertAlmostEqual( + train1_1_np.all(), train1_2_np.all(), delta=delta) + + def test_dist(self): + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'DIST', + 'OPTIMIZER': 'ADAM', + 'SKIP_STEPS': str(np.random.randint(2, 6)) } self.check_with_place( "dist_save_load.py", diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3d1ce6b27c935ddca0f2f5fb377e69b571e3714c..3566fed215229223f4d2ecd1bbb66cb297dd7716 
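# Editor's note: the environment protocol driving the save/load tests
# above; the values are illustrative. SAVE_MODE selects the LOCAL or DIST
# branch in dist_save_load.py and SKIP_STEPS is the checkpointing step.
import os

os.environ.update({
    "SAVE_MODE": "DIST",        # or "LOCAL"
    "SAVE": "1",                # run the saving pass
    "MODEL_DIR": "/tmp/model",  # where persistables are written
    "SKIP_STEPS": "3",          # trainer 0 checkpoints at this step
})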
100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest): pserver, _ = self.get_pserver(self.pserver1_ep) pserver2, _ = self.get_pserver(self.pserver2_ep) - self.assertTrue(pserver._slice_vars_and_attrs) - self.assertTrue(pserver2._slice_vars_and_attrs) - - for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): - self.assertEqual(pserver._slice_vars_and_attrs[idx][0], - pserver2._slice_vars_and_attrs[idx][0]) - - total_numel = six.moves.reduce( - lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape) - self.assertEqual( - total_numel, - six.moves.reduce(lambda x, y: x * y, - pserver._slice_vars_and_attrs[idx][2].shape) + - six.moves.reduce(lambda x, y: x * y, - pserver2._slice_vars_and_attrs[idx][2].shape)) + vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver1_ep) + vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep( + self.pserver2_ep) + + self.assertTrue(vars_ps1) + self.assertTrue(vars_ps2) + + for idx in six.moves.xrange(len(vars_ps1)): + total_numel = 0 + ps1_numel, ps2_numel = 0, 0 + + ps1_var = vars_ps1[idx] + + if not ps1_var.is_slice: + total_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + vars_ps1[idx].slice.shape) + else: + ps2_var = None + for var in vars_ps2: + if var.origin.name == ps1_var.origin.name: + ps2_var = var + break + + total_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.origin.shape) + ps1_numel = six.moves.reduce(lambda x, y: x * y, + ps1_var.slice.shape) + ps2_numel = six.moves.reduce(lambda x, y: x * y, + ps2_var.slice.shape) + + self.assertEqual(total_numel, ps1_numel + ps2_numel) class TestNCCL2Transpile(TranspilerTest): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py index 7ec1f0ae753724dac5c4675926ead87a097a7a99..56dfb095def62bc617948821038f0c15c1547683 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -16,12 +16,17 @@ import os import unittest os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +# FIXME(zjl): It seems that this unittest fails randomly +# when comparing all reduce last loss and reduce last loss +# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta +# Disable it temporarily. +''' from test_parallel_executor_mnist import TestMNIST class EagerDeletionTestMNIST(TestMNIST): pass - +''' if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1d7ce33ea7ca2c53dc2bb2a7048444c818d4f33f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py @@ -0,0 +1,421 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +import six +import paddle.fluid as fluid +from op_test import OpTest +''' +# Equivalent code +rles = mask_util.frPyObjects([segm], im_h, im_w) +mask = mask_util.decode(rles) +''' + + +def decode(cnts, m): + v = 0 + mask = [] + for j in range(m): + for k in range(cnts[j]): + mask.append(v) + v = 1 - v + return mask + + +def poly2mask(xy, k, h, w): + scale = 5. + x = [int(scale * p + 0.5) for p in xy[::2]] + x = x + [x[0]] + y = [int(scale * p + 0.5) for p in xy[1::2]] + y = y + [y[0]] + m = sum([ + int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1) + for j in range(k) + ]) + + u, v = [], [] + for j in range(k): + xs = x[j] + xe = x[j + 1] + ys = y[j] + ye = y[j + 1] + dx = abs(xe - xs) + dy = abs(ys - ye) + flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) + if flip: + xs, xe = xe, xs + ys, ye = ye, ys + + if dx >= dy: + if (dx == 0): assert ye - ys == 0 + s = 0 if dx == 0 else float(ye - ys) / dx + else: + if (dy == 0): assert xe - xs == 0 + s = 0 if dy == 0 else float(xe - xs) / dy + + if dx >= dy: + ts = [dx - d if flip else d for d in range(dx + 1)] + u.extend([xs + t for t in ts]) + v.extend([int(ys + s * t + .5) for t in ts]) + else: + ts = [dy - d if flip else d for d in range(dy + 1)] + v.extend([t + ys for t in ts]) + u.extend([int(xs + s * t + .5) for t in ts]) + + k = len(u) + x = np.zeros((k), np.int) + y = np.zeros((k), np.int) + m = 0 + for j in six.moves.xrange(1, k): + if u[j] != u[j - 1]: + xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) + xd = (xd + .5) / scale - .5 + if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)): + continue + yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) + yd = (yd + .5) / scale - .5 + yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) + x[m] = int(xd) + y[m] = int(yd) + m += 1 + k = m + a = [int(x[i] * h + y[i]) for i in range(k)] + a.append(h * w) + a.sort() + b = [0] + a[:len(a) - 1] + a = [c - d for (c, d) in zip(a, b)] + + k += 1 + b = [0 for i in range(k)] + b[0] = a[0] + m, j = 1, 1 + while (j < k): + if a[j] > 0: + b[m] = a[j] + m += 1 + j += 1 + else: + j += 1 + if (j < k): + b[m - 1] += a[j] + j += 1 + mask = decode(b, m) + mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = mask.transpose((1, 0)) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + return boxes_from_polys + + +def bbox_overlaps(boxes, query_boxes): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\ + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) -\ + max(boxes[n, 
0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) -\ + max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) *\ + (boxes[n, 3] - boxes[n, 1] + 1) +\ + box_area - iw * ih) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + mask = [] + for polygons in polygons_norm: + assert polygons.shape[0] % 2 == 0 + k = polygons.shape[0] // 2 + mask.append(poly2mask(polygons, k, M, M)) + mask = np.array(mask) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=0) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): + """Expand masks from shape (#masks, resolution ** 2) + to (#masks, #classes * resolution ** 2) to encode class + specific mask targets. + """ + assert masks.shape[0] == mask_class_labels.shape[0] + + # Target values of -1 are "don't care" / ignore labels + mask_targets = -np.ones( + (masks.shape[0], num_classes * resolution**2), dtype=np.int32) + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = resolution**2 * cls + end = start + resolution**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + return mask_targets + + +def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd, + label_int32, gt_polys, resolution, rois, roi_lod, + gt_lod): + mask_rois = [] + roi_has_mask_int32 = [] + mask_int32 = [] + new_lod = [] + for i in range(len(im_info)): + roi_s = roi_lod[i] + roi_e = roi_lod[i + 1] + gt_s = gt_lod[i] + gt_e = gt_lod[i + 1] + mask_blob = _sample_mask(num_classes, im_info[i], gt_classes[gt_s:gt_e], + is_crowd[gt_s:gt_e], label_int32[roi_s:roi_e], + gt_polys[i], resolution, rois[roi_s:roi_e]) + new_lod.append(mask_blob['mask_rois'].shape[0]) + mask_rois.append(mask_blob['mask_rois']) + roi_has_mask_int32.append(mask_blob['roi_has_mask_int32']) + mask_int32.append(mask_blob['mask_int32']) + return mask_rois, roi_has_mask_int32, mask_int32, new_lod + + +def _sample_mask( + num_classes, + im_info, + gt_classes, + is_crowd, + label_int32, + gt_polys, # [[[], []], []] + resolution, + rois): + mask_blob = {} + im_scale = im_info[2] + sample_boxes = rois + polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] + polys_gt = [gt_polys[i] for i in polys_gt_inds] + boxes_from_polys = polys_to_boxes(polys_gt) + + fg_inds = np.where(label_int32 > 0)[0] + roi_has_mask = fg_inds.copy() + if fg_inds.shape[0] > 0: + mask_class_labels = label_int32[fg_inds] + masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) + rois_fg = sample_boxes[fg_inds] + overlaps_bbfg_bbpolys = bbox_overlaps( + rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32)) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + for i in range(rois_fg.shape[0]): + fg_polys_ind = 
fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) + mask = np.array(mask > 0, dtype=np.int32) + masks[i, :] = np.reshape(mask, resolution**2) + else: + bg_inds = np.where(label_int32 == 0)[0] + rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) + masks = -np.ones((1, resolution**2), dtype=np.int32) + mask_class_labels = np.zeros((1, )) + roi_has_mask = np.append(roi_has_mask, 0) + masks = expand_mask_targets(masks, mask_class_labels, resolution, + num_classes) + rois_fg *= im_scale + mask_blob['mask_rois'] = rois_fg + mask_blob['roi_has_mask_int32'] = roi_has_mask + mask_blob['mask_int32'] = masks + return mask_blob + + +def trans_lod(lod): + new_lod = [0] + for i in range(len(lod)): + new_lod.append(lod[i] + new_lod[i]) + return new_lod + + +class TestGenerateMaskLabels(OpTest): + def set_data(self): + self.init_test_case() + self.make_generate_proposal_labels_out() + self.generate_gt_polys() + self.generate_groundtruth() + self.init_test_output() + self.inputs = { + 'ImInfo': self.im_info, + 'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod), + 'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod), + 'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod), + 'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod), + 'Rois': (self.rois.astype(np.float32), self.rois_lod) + } + self.attrs = { + 'num_classes': self.num_classes, + 'resolution': self.resolution + } + self.outputs = { + 'MaskRois': (self.mask_rois, [self.new_lod]), + 'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]), + 'MaskInt32': (self.mask_int32, [self.new_lod]) + } + + def init_test_case(self): + self.num_classes = 81 + self.resolution = 14 + self.batch_size = 2 + self.batch_size_per_im = 64 + self.images_shape = [100, 200] + np.random.seed(0) + + def make_generate_proposal_labels_out(self): + rois = [] + self.rois_lod = [[]] + self.label_int32 = [] + for bno in range(self.batch_size): + self.rois_lod[0].append(self.batch_size_per_im) + for i in range(self.batch_size_per_im): + xywh = np.random.rand(4) + xy1 = xywh[0:2] * 2 + wh = xywh[2:4] * (self.images_shape[0] - xy1) + xy2 = xy1 + wh + roi = [xy1[0], xy1[1], xy2[0], xy2[1]] + rois.append(roi) + self.rois = np.array(rois).astype("float32") + for idx, roi_num in enumerate(self.rois_lod[0]): + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + if idx == 0: + # set an image with no foreground, to test the empty case + self.label_int32.append(0) + else: + self.label_int32.append(class_id) + label_np = np.array(self.label_int32) + self.label_int32 = label_np[:, np.newaxis] + + def generate_gt_polys(self): + h, w = self.images_shape[0:2] + self.gt_polys = [] + self.gt_polys_list = [] + max_gt = 4 + max_poly_num = 5 + min_poly_size = 4 + max_poly_size = 16 + lod0 = [] + lod1 = [] + lod2 = [] + for i in range(self.batch_size): + gt_num = np.random.randint(1, high=max_gt, size=1)[0] + lod0.append(gt_num) + ptss = [] + for i in range(gt_num): + poly_num = np.random.randint(1, max_poly_num, size=1)[0] + lod1.append(poly_num) + pts = [] + for j in range(poly_num): + poly_size = np.random.randint( + min_poly_size, max_poly_size, size=1)[0] + x = np.random.rand(poly_size, 1) * w + y = np.random.rand(poly_size, 1) * h + xy = np.concatenate((x, y), axis=1) + pts.append(xy.flatten().tolist()) + self.gt_polys.extend(xy.flatten().tolist()) + lod2.append(poly_size) + ptss.append(pts) + self.gt_polys_list.append(ptss) + 
self.masks_lod = [lod0, lod1, lod2] + self.gt_lod = [lod0] + self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2) + + def generate_groundtruth(self): + self.im_info = [] + self.gt_classes = [] + self.is_crowd = [] + for roi_num in self.gt_lod[0]: + self.im_info.append(self.images_shape + [1.0]) + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + self.gt_classes.append(class_id) + self.is_crowd.append(0) + self.im_info = np.array(self.im_info).astype(np.float32) + gt_classes_np = np.array(self.gt_classes) + self.gt_classes = gt_classes_np[:, np.newaxis] + is_crowd_np = np.array(self.is_crowd) + self.is_crowd = is_crowd_np[:, np.newaxis] + + def init_test_output(self): + roi_lod = trans_lod(self.rois_lod[0]) + gt_lod = trans_lod(self.gt_lod[0]) + outs = generate_mask_labels(self.num_classes, self.im_info, + self.gt_classes, self.is_crowd, + self.label_int32, self.gt_polys_list, + self.resolution, self.rois, roi_lod, gt_lod) + self.mask_rois = outs[0] + self.roi_has_mask_int32 = outs[1] + self.mask_int32 = outs[2] + self.new_lod = outs[3] + + self.mask_rois = np.vstack(self.mask_rois) + self.roi_has_mask_int32 = np.hstack(self.roi_has_mask_int32)[:, + np.newaxis] + self.mask_int32 = np.vstack(self.mask_int32) + + def setUp(self): + self.op_type = "generate_mask_labels" + self.set_data() + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6..5f6328707fd80ec8f11b96cc65e2dcaf44496d58 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 9340d558577b4b3141df9317900ee33bbb683a0e..5ce405dccae4cfd66cde471c097698b0869f29fe 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 07693caddb9328e12b5901a717cc92ee47644a25..fab60ae75647a4b3cd1b76416ec88cd21f77f41a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -255,7 +255,8 @@ class TestImperative(unittest.TestCase): x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) param_grads = fluid.backward.append_backward( x, parameter_list=[x1.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -282,7 +283,8 @@ class TestImperative(unittest.TestCase): x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -308,7 +310,8 @@ class TestImperative(unittest.TestCase): out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) static_out, static_grad = exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 4fe286f85ec551946a9431f70d7012b4e7d79662..681661bfc63db95653be371688a047efe96f3866 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope @@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_gan_float32(self): seed = 90 startup = fluid.Program() @@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase): sgd = SGDOptimizer(learning_rate=1e-3) sgd.minimize(g_loss) - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) static_params = dict() with fluid.scope_guard(scope): img = np.ones([2, 1], np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 63eeae4b712c2064309b664b91d5f0347b67817d..d0a5a883174cb33a035b344f9489b2ba02ba99f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..87a72dd04e376cf9225e275d862b0cbbb9774e2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -0,0 +1,370 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 1281164, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # optimizer = fluid.optimizer.Momentum( + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + + return optimizer + + +class ConvBNLayer(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.imperative.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + 
+            num_channels=num_filters,
+            num_filters=num_filters * 4,
+            filter_size=1,
+            act=None)
+
+        if not shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters * 4,
+                filter_size=1,
+                stride=stride)
+
+        self.shortcut = shortcut
+
+        self._num_channels_out = num_filters * 4
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        conv2 = self.conv2(conv1)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        y = fluid.layers.elementwise_add(x=short, y=conv2)
+
+        layer_helper = LayerHelper('elementwise_add_activation', act='relu')
+        return layer_helper.append_activation(y)
+
+
+class ResNet(fluid.imperative.Layer):
+    def __init__(self, layers=50, class_dim=102):
+        super(ResNet, self).__init__()
+
+        self.layers = layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_filters = [64, 128, 256, 512]
+
+        self.conv = ConvBNLayer(
+            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
+        self.pool2d_max = Pool2D(
+            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+        self.bottleneck_block_list = []
+        num_channels = 64
+        for block in range(len(depth)):
+            shortcut = False
+            for i in range(depth[block]):
+                bottleneck_block = BottleneckBlock(
+                    num_channels=num_channels,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    shortcut=shortcut)
+                num_channels = bottleneck_block._num_channels_out
+                self.bottleneck_block_list.append(bottleneck_block)
+                shortcut = True
+
+        self.pool2d_avg = Pool2D(
+            pool_size=7, pool_type='avg', global_pooling=True)
+
+        import math
+        stdv = 1.0 / math.sqrt(2048 * 1.0)
+
+        self.out = FC(size=class_dim,
+                      act='softmax',
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+    def forward(self, inputs):
+        y = self.conv(inputs)
+        y = self.pool2d_max(y)
+        for bottleneck_block in self.bottleneck_block_list:
+            y = bottleneck_block(y)
+        y = self.pool2d_avg(y)
+        y = self.out(y)
+        return y
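As a sanity check on the configuration above: the depth table gives 3 + 4 + 6 + 3 = 16 bottleneck blocks for ResNet-50, and every block ends at num_filters * 4 channels, so the network finishes at 512 * 4 = 2048 channels, which is exactly the 1/sqrt(2048) used for the FC stdv. A quick sketch of that arithmetic, under my reading of the constructor:

import math

# Depth table copied from the ResNet constructor above.
depth_by_layers = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}
num_filters = [64, 128, 256, 512]

blocks = sum(depth_by_layers[50])       # 16 bottleneck blocks
out_channels = num_filters[-1] * 4      # 2048
stdv = 1.0 / math.sqrt(out_channels)    # matches the FC initializer bound
print(blocks, out_channels, stdv)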
+
+
+class TestImperativeResnet(unittest.TestCase):
+    def test_resnet_float32(self):
+        seed = 90
+
+        batch_size = train_parameters["batch_size"]
+        batch_num = 1
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            resnet = ResNet()
+            optimizer = optimizer_setting(train_parameters)
+            np.random.seed(seed)
+            import random
+            random.seed(seed)
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size)
+
+            dy_param_init_value = {}
+            for param in fluid.default_main_program().global_block(
+            ).all_parameters():
+                dy_param_init_value[param.name] = param._numpy()
+
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= batch_num:
+                    break
+
+                dy_x_data = np.array(
+                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    batch_size, 1)
+
+                img = to_variable(dy_x_data)
+                label = to_variable(y_data)
+                label._stop_gradient = True
+
+                out = resnet(img)
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+
+                dy_out = avg_loss._numpy()
+
+                if batch_id == 0:
+                    for param in fluid.default_main_program().global_block(
+                    ).all_parameters():
+                        if param.name not in dy_param_init_value:
+                            dy_param_init_value[param.name] = param._numpy()
+
+                avg_loss._backward()
+
+                dy_grad_value = {}
+                for param in fluid.default_main_program().global_block(
+                ).all_parameters():
+                    if not param.stop_gradient:
+                        np_array = np.array(param._ivar._grad_ivar().value()
+                                            .get_tensor())
+                        dy_grad_value[param.name + core.grad_var_suffix(
+                        )] = np_array
+
+                optimizer.minimize(avg_loss)
+
+            dy_param_value = {}
+            for param in fluid.default_main_program().global_block(
+            ).all_parameters():
+                dy_param_value[param.name] = param._numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+
+            resnet = ResNet()
+            optimizer = optimizer_setting(train_parameters)
+
+            np.random.seed(seed)
+            import random
+            random.seed(seed)
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[3, 224, 224], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            out = resnet(img)
+            loss = fluid.layers.cross_entropy(input=out, label=label)
+            avg_loss = fluid.layers.mean(x=loss)
+            optimizer.minimize(avg_loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            static_grad_name_list = []
+            for param in fluid.default_startup_program().global_block(
+            ).all_parameters():
+                static_param_name_list.append(param.name)
+            for param in fluid.default_main_program().global_block(
+            ).all_parameters():
+                if not param.stop_gradient:
+                    static_grad_name_list.append(param.name +
+                                                 core.grad_var_suffix())
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= batch_num:
+                    break
+
+                static_x_data = np.array(
+                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    [batch_size, 1])
+
+                fetch_list = [avg_loss.name]
+                fetch_list.extend(static_param_name_list)
+                fetch_list.extend(static_grad_name_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={"pixel": static_x_data,
+                                    "label": y_data},
+                              fetch_list=fetch_list)
+
+                static_param_value = {}
+                static_grad_value = {}
+                static_out = out[0]
+                param_start_pos = 1
+                grad_start_pos = len(static_param_name_list) + param_start_pos
+                for i in range(param_start_pos,
+                               len(static_param_name_list) + param_start_pos):
+                    static_param_value[static_param_name_list[
+                        i - param_start_pos]] = out[i]
+                for i in range(grad_start_pos,
+                               len(static_grad_name_list) + grad_start_pos):
+                    static_grad_value[static_grad_name_list[
+                        i - grad_start_pos]] = out[i]
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_grad_value), len(static_grad_value))
+        for key, value in six.iteritems(static_grad_value):
+            self.assertTrue(np.allclose(value, dy_grad_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_param_value), len(static_param_value))
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+
+if __name__ == '__main__':
+    unittest.main()
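The test above runs one batch imperatively, records loss/params/grads, then replays the identical seeded batch through the static graph and compares name by name. Reduced to its skeleton (the names here are illustrative, not the patch's API):

import numpy as np


def compare_runs(dynamic_fn, static_fn, rtol=1e-5):
    # Each callable consumes identically seeded data and returns a
    # {name: ndarray} dict; closeness is asserted per name.
    dy, st = dynamic_fn(), static_fn()
    assert set(dy) == set(st)
    for name in dy:
        assert np.allclose(dy[name], st[name], rtol=rtol), name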
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index ab7183f88df809e584ca50ba16221bfdfe1376a9..2d98b063d10e2bb9071c4b8dc4ac9373f63df387 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -420,5 +420,26 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.type, 'assign_value')


+class TestNumpyArrayInitializer(unittest.TestCase):
+    def test_numpy_array_initializer(self):
+        """Test the numpy array initializer with supplied arguments
+        """
+        import numpy
+        program = framework.Program()
+        block = program.global_block()
+        np_array = numpy.random.random((10000)).astype("float32")
+        for _ in range(2):
+            block.create_parameter(
+                dtype=np_array.dtype,
+                shape=np_array.shape,
+                lod_level=0,
+                name="param",
+                initializer=initializer.NumpyArrayInitializer(np_array))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+        assert (init_op.attr('fp32_values') == np_array).all()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
index 8c9e489e02839e25cfabe14c16bfd91a908bd734..7e1c2572f08598b8b600517e4a82b48ca71cc20d 100644
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -111,7 +111,7 @@ class TestPassBuilder(unittest.TestCase):
         pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
         self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))

-        viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
+        viz_pass.set("graph_viz_path", "/tmp/test_viz_pass")

         self.check_network_convergence(
             use_cuda=core.is_compiled_with_cuda(),
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index ffd4026dbade2f8f7eace399c52ae0428f3e8d7b..d33a57f675aa98cf13e1ac0014109d9cb3856e87 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -81,11 +81,10 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
         y_lod = [[2, 3], [2, 2, 3, 3, 3]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 0}
+        self.attrs = {'ref_level': 1}


 class TestSequenceExpandCase2(TestSequenceExpand):
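The TestNumpyArrayInitializer added above exercises NumpyArrayInitializer end to end; typical caller-side usage presumably mirrors the test body, something like:

import numpy as np
import paddle.fluid as fluid

np_array = np.random.random(10000).astype("float32")
param = fluid.default_main_program().global_block().create_parameter(
    dtype=np_array.dtype,
    shape=np_array.shape,
    name="param",
    # Materializes as a single assign_value op carrying fp32_values.
    initializer=fluid.initializer.NumpyArrayInitializer(np_array))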
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 41797a241cab9f2b3bc4b492a1c4b6db89ac2948..ae1883f1f7e44e06e378ff6d16dbc3c5060027e4 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -18,6 +18,7 @@ import numpy as np
 from op_test import OpTest
 from scipy.special import logit
 from scipy.special import expit
+import paddle.fluid.core as core
 import unittest


@@ -117,5 +118,36 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
         self.check_grad(['X'], 'Out')


+class TestSigmoidCrossEntropyWithNorm(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
index 6b78ceeaeec4d9b3db6524a5b5e939f88267340c..89dd4dd50b0299de986b84f46e889d554030f180 100644
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ b/python/paddle/fluid/transpiler/details/checkport.py
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types


 def wait_server_ready(endpoints):
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
             wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
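The expected output built in TestSigmoidCrossEntropyWithNorm above is plain numpy: elementwise binary cross-entropy, zeroed at ignored labels, then divided by the count of non-ignored entries. As a standalone reference of the same math:

import numpy as np
from scipy.special import expit, logit

batch_size, num_classes, ignore_index = 64, 20, -1
x = logit(np.random.uniform(0, 1, (batch_size, num_classes)).astype("float32"))
label = np.random.randint(-1, 2, (batch_size, num_classes)).astype("float32")

sig = expit(x)
out = -(label * np.log(sig) + (1 - label) * np.log(1 - sig))
out[label == ignore_index] = 0                  # "don't care" entries
out /= float((label != ignore_index).sum())     # the normalize=True branch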
+ """ + + def __init__(self, name, shape, dtype, type, lod_level, persistable): + self.name = name + self.shape = shape + self.dtype = dtype + self.type = type + self.lod_level = lod_level + self.persistable = persistable + + +class VarDistributed(object): + """ + a class to record the var distributed on parameter servers. + the class will record the relationship between origin var and slice var. + the slice var's properties, such as type/shape/offset/endpoint. + """ + + def __init__(self, + origin_var, + slice_var, + is_slice=None, + block_id=None, + offset=None, + vtype=None, + endpoint=None): + """ + Args: + origin_var(Variable|VarStruct): origin var properties + slice_var(Variable|VarStruct): slice var properties + is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. + block_id(int|None): the number about the slice var. + offset(int|None): if the slice var is sliced, offset is the numel before the var. + vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. + endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" + """ + + if isinstance(origin_var, Variable): + self.origin = self.__create_var_struct(origin_var) + else: + self.origin = origin_var + + if isinstance(slice_var, Variable): + self.slice = self.__create_var_struct(slice_var) + else: + self.slice = slice_var + + if self.equal(self.origin, self.slice): + self.is_slice = False + self.block_id = 0 + self.offset = 0 + else: + self.is_slice = True + self.block_id = 0 + self.offset = 0 + + if is_slice is not None: + self.is_slice = is_slice + if block_id is not None: + self.block_id = block_id + if offset is not None: + self.offset = offset + + self.vtype = vtype + self.endpoint = endpoint + + @staticmethod + def __create_var_struct(var): + return VarStruct(var.name, var.shape, var.dtype, var.type, + var.lod_level, var.persistable) + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. + Returns: + bool: equal will return True else False + """ + assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct) + + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def __str__(self): + origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \ + format(i="{", e="}", name=self.origin.name, type=self.origin.type, + shape=self.origin.shape, dtype=self.origin.dtype) + + slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \ + ".slice({is_slice}).block({block_id}).offset({offset})". \ + format(i="{", e="}", name=self.slice.name, type=self.slice.type, + shape=self.slice.shape, dtype=self.slice.dtype, + is_slice=self.is_slice, block_id=self.block_id, offset=self.offset) + + return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format( + self.vtype, origin_var_str, slice_var_str, self.endpoint) + + +class VarsDistributed(object): + """ + a gather about VarDistributed with many methods to find distributed vars. + through the class, we can get overview about the distributed parameters on parameter servers. + this class may centralized and convenient for developer to manage and get variable's distribute. + other module can also use this to find variables such io.py. 
+ """ + + def __init__(self): + self.distributed_vars = [] + + def add_distributed_var(self, + origin_var, + slice_var, + is_slice=None, + block_id=None, + offset=None, + vtype=None, + endpoint=None): + """ + add distributed var in this. + + Args: + origin_var(Variable|VarStruct): origin var properties + slice_var(Variable|VarStruct): slice var properties + is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. + block_id(int|None): the number about the slice var. + offset(int|None): if the slice var is sliced, offset is the numel before the var. + vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. + endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" + Returns: + None + """ + self.distributed_vars.append( + VarDistributed(origin_var, slice_var, is_slice, block_id, offset, + vtype, endpoint)) + + def get_distributed_var_by_slice(self, var_name): + """ + get distributed var by conditions. + + Args: + var_name(str): slice var name, such as "w.traier0.block1" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.slice.name == var_name: + return dist_var + return None + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. + Returns: + bool: equal will return True else False + """ + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint): + """ + get distributed var by conditions. + + Args: + origin_var_name(str): + endpoint(str): the parameter endpoint, such as "127.0.0.1:1001" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint: + return dist_var + return None + + def get_distributed_vars_by_vtypes(self, vtypes, groupby=False): + """ + get distributed vars by conditions. + + Args: + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + groupby(bool|False): group by origin var or not. + + Returns: + list: distributed var list. + dict: distributed var map when groupby=True + """ + vtype_vars = [] + for var in self.distributed_vars: + if var.vtype in vtypes: + vtype_vars.append(var) + if not groupby: + return vtype_vars + + params_map = {} + for var in vtype_vars: + origin_var_name = var.origin.name + + if origin_var_name in params_map.keys(): + optimizers = params_map.get(origin_var_name) + else: + optimizers = [] + optimizers.append(var) + params_map[origin_var_name] = optimizers + return params_map + + def get_distributed_vars_by_ep(self, endpoint, vtype=None): + """ + get distributed vars by conditions. + + Args: + endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001" + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + + Returns: + list: distributed var list. + """ + endpoint_vars = [] + for var in self.distributed_vars: + if var.endpoint == endpoint: + endpoint_vars.append(var) + if not vtype: + return endpoint_vars + + vtype_vars = [] + for var in endpoint_vars: + if var.vtype == vtype: + vtype_vars.append(var) + return vtype_vars + + def overview(self): + """ + get the overview string about all params on all parameter servers. + + Returns: + Str: overview string. 
+
+
 class VarBlock:
     def __init__(self, varname, offset, size):
         self.varname = varname
@@ -327,6 +581,7 @@ class DistributeTranspiler(object):
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
         self.pserver_endpoints = pserver_endpoints
+        self.vars_overview = VarsDistributed()
         self.optimize_ops, self.params_grads = self._get_optimize_pass()

         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
@@ -347,6 +602,7 @@ class DistributeTranspiler(object):
         # add distributed attrs to program
         self.origin_program._is_distributed = True
         self.origin_program._endpoints = self.pserver_endpoints
+        self.origin_program._ps_endpoint = current_endpoint
         self.origin_program._is_chief = self.trainer_id == 0
         self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None

@@ -454,6 +710,10 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])

+            distributed_var = self.vars_overview.get_distributed_var_by_slice(
+                recv_vars[i].name)
+            distributed_var.endpoint = ep
+
         # step4: Concat the parameters splits together after recv.
         all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
@@ -480,6 +740,12 @@ class DistributeTranspiler(object):
                 recv_op_role_var_name = splited_trainer_grad[0].name

             if param_varname in self.sparse_param_to_height_sections:
+
+                for table_name in table_names:
+                    distributed_var = self.vars_overview.get_distributed_var_by_slice(
+                        table_name)
+                    distributed_var.vtype = "RemotePrefetch"
+
                 height_sections = self.sparse_param_to_height_sections[
                     param_varname]
                 self._update_remote_sparse_update_op(
@@ -532,6 +798,9 @@ class DistributeTranspiler(object):
                 pserver_endpoints)
             self._split_table_grad_and_add_send_vars(program, pserver_endpoints)

+        self._get_distributed_optimizer_vars()
+        self.origin_program._parameters_on_pservers = self.vars_overview
+
     def get_trainer_program(self, wait_port=True):
         """
         Get transpiled trainer side program.
@@ -541,6 +810,7 @@ class DistributeTranspiler(object):
         """
         # remove optimize ops and add a send op to main_program
         # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
+
         lr_ops = self._get_lr_ops()
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         delete_ops(self.origin_program.global_block(), lr_ops)
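Given the deprecation notice added to get_pserver_program() just below, callers should move to the single-call form; roughly (endpoints are placeholders, and current_endpoint matches the transpile() argument threaded through above):

import paddle.fluid as fluid

t = fluid.DistributeTranspiler()
t.transpile(trainer_id=0,
            program=fluid.default_main_program(),
            pservers="127.0.0.1:6174,127.0.0.1:6175",
            trainers=2,
            current_endpoint="127.0.0.1:6174")
# Preferred: fetch the pserver main and startup programs together.
pserver_prog, pserver_startup = t.get_pserver_programs("127.0.0.1:6174")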
@@ -665,9 +935,14 @@ class DistributeTranspiler(object):
         # NOTE: assume blocks of the same variable is not distributed
         # on the same pserver, only change param/grad varnames for
         # trainers to fetch.
+        sys.stderr.write(
+            "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
+        )
         # step1
         pserver_program = Program()
         pserver_program.random_seed = self.origin_program.random_seed
+        pserver_program._copy_dist_param_info_from(self.origin_program)
+
         # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
@@ -703,9 +978,6 @@ class DistributeTranspiler(object):
             else:
                 recv_inputs.append(single_trainer_var)

-        self._slice_params_and_optimizes = self._get_slice_vars_and_attrs(
-            endpoint)
-
         # step 3
         # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
@@ -882,10 +1154,6 @@ class DistributeTranspiler(object):
                 outputs={},
                 attrs=attrs)

-        # add distributed attrs
-        pserver_program._slice_vars_and_attrs = list(
-            self._slice_params_and_optimizes.values())
-
         pserver_program._sync_with_cpp()
         # save pserver program to generate pserver side startup relatively.
         self.pserver_program = pserver_program
@@ -984,30 +1252,88 @@ class DistributeTranspiler(object):
                 inputs={"X": startup_param_var},
                 outputs={"Out": startup_tmpvar})

-        # add slice vars
-        s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs
-
         return s_prog

-    def _get_slice_vars_and_attrs(self, endpoint):
-        slice_vars_and_attrs = {}
+    # ====================== private transpiler functions =====================
+    def _get_slice_var_info(self, slice_var):
         block_suffix = "block"
-        for param in self.param_grad_ep_mapping[endpoint]["params"]:
-            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
-            if not block_name:
-                continue
+        block_idx = 0
+        offset = 0
+        is_slice = False

-            block_idx = int(block_name.split(block_suffix)[1])
-            orig_var = self.origin_program.global_block().vars[orig_var_name]
+        orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name)

-            skip_dim0 = 0
-            slice_vars = self.param_var_mapping[orig_var_name]
-            for slice_var in slice_vars[:block_idx]:
-                skip_dim0 += slice_var.shape[0]
-            slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param]
-        return slice_vars_and_attrs
+        if not block_name:
+            return is_slice, block_idx, offset

-    # ====================== private transpiler functions =====================
+        block_idx = int(block_name.split(block_suffix)[1])
+        skip_dim0 = 0
+        slice_vars = self.param_var_mapping[orig_var_name]
+
+        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
+
+        for slice_var in slice_vars[:block_idx]:
+            skip_dim0 += slice_var.shape[0]
+
+        offset = skip_dim0 * orig_dim1_flatten
+        is_slice = True
+        return is_slice, block_idx, offset
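To make the offset arithmetic in _get_slice_var_info concrete: a 10x8 parameter split into two 5x8 blocks has 5 rows of 8 elements before block 1, so its offset is 5 * 8 = 40. The same computation, self-contained:

from functools import reduce

slice_shapes = [(5, 8), (5, 8)]  # blocks of a (10, 8) parameter
block_idx = 1

orig_dim1_flatten = reduce(lambda x, y: x * y, slice_shapes[0][1:])  # 8
skip_dim0 = sum(s[0] for s in slice_shapes[:block_idx])              # 5
offset = skip_dim0 * orig_dim1_flatten                               # 40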
+    def _get_distributed_optimizer_vars(self):
+        def _get_distributed_optimizer_var(endpoint):
+            opt_op_on_pserver = []
+            for _, op in enumerate(self.optimize_ops):
+                if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
+                        endpoint, op):
+                    opt_op_on_pserver.append(op)
+
+            for opt_op in opt_op_on_pserver:
+                dist_var = None
+                for key in opt_op.input_names:
+                    if key == "Param":
+                        param_name = opt_op.input(key)[0]
+                        dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep(
+                            param_name, endpoint)
+                        break
+                for key in opt_op.input_names:
+                    if key in ["Param", "Grad", "LearningRate"]:
+                        continue
+                    origin_var = self.origin_program.global_block().vars[
+                        opt_op.input(key)[0]]
+                    # update accumulator variable shape
+                    new_shape = self._get_optimizer_input_shape(
+                        opt_op.type, key, origin_var.shape,
+                        dist_var.slice.shape)
+
+                    if new_shape == dist_var.slice.shape:
+                        splited_var = VarStruct(
+                            name=origin_var.name,
+                            shape=new_shape,
+                            dtype=origin_var.dtype,
+                            type=origin_var.type,
+                            lod_level=origin_var.lod_level,
+                            persistable=origin_var.persistable)
+
+                        self.vars_overview.add_distributed_var(
+                            origin_var=origin_var,
+                            slice_var=splited_var,
+                            is_slice=dist_var.is_slice,
+                            block_id=dist_var.block_id,
+                            offset=dist_var.offset,
+                            vtype="Optimizer",
+                            endpoint=endpoint)
+                    else:
+                        self.vars_overview.add_distributed_var(
+                            origin_var=origin_var,
+                            slice_var=origin_var,
+                            is_slice=False,
+                            block_id=0,
+                            offset=0,
+                            vtype="Optimizer",
+                            endpoint=endpoint)
+
+        for ep in self.pserver_endpoints:
+            _get_distributed_optimizer_var(ep)

     def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                        params_grads):
@@ -1093,6 +1419,22 @@ class DistributeTranspiler(object):
         # origin_param_name -> [splited_param_vars]
         self.param_var_mapping = self._create_vars_from_blocklist(
             self.origin_program, param_blocks)
+
+        for orig_name, splited_vars in self.param_var_mapping.items():
+            orig_var = self.origin_program.global_block().var(orig_name)
+
+            for splited_var in splited_vars:
+                is_slice, block_id, offset = self._get_slice_var_info(
+                    splited_var)
+
+                self.vars_overview.add_distributed_var(
+                    origin_var=orig_var,
+                    slice_var=splited_var,
+                    block_id=block_id,
+                    offset=offset,
+                    is_slice=is_slice,
+                    vtype="Param")
+
         # origin_grad_name -> [splited_grad_vars]
         self.grad_var_mapping = self._create_vars_from_blocklist(
             self.origin_program,
@@ -1729,13 +2071,6 @@ class DistributeTranspiler(object):
                         shape=new_shape)
                 new_inputs[key] = tmpvar

-        # var shape been changed
-        if new_shape != var.shape:
-            slice_var_args = self._slice_params_and_optimizes[
-                param_var.name]
-            self._slice_params_and_optimizes[
-                var.name] = [var, slice_var_args[1], tmpvar]
-
         # change output's ParamOut variable
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
@@ -1763,8 +2098,8 @@ class DistributeTranspiler(object):
             # skip per trainer vars
             if g.name.find(".trainer_") == -1:
                 # only param or grads have splited blocks
-                if self._orig_varname(g.name) in self.grad_name_to_param_name or\
-                   self._orig_varname(g.name) in self.param_name_to_grad_name:
+                if self._orig_varname(g.name) in self.grad_name_to_param_name or \
+                        self._orig_varname(g.name) in self.param_name_to_grad_name:
                     grad_block = g
                     break
         return grad_block
diff --git a/python/setup.py.in b/python/setup.py.in
index e00c88b3a6e49bd806bcc6125fa8dc0f69928227..fb4b273a0676fcbcb4402eaf54ddf73d37a2754f 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -113,6 +113,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.core',
           'paddle.fluid.contrib.slim.graph',
           'paddle.fluid.contrib.slim.prune',
+          'paddle.fluid.contrib.slim.quantization',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
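Tying this back to the rewritten TestLoadSliceVar earlier in the patch: after transpile() stores the overview on the program (and, by my reading, _copy_dist_param_info_from carries it onto the pserver program), test code can query the per-endpoint distribution like so:

pserver_prog, _ = t.get_pserver_programs("127.0.0.1:6174")
vars_on_ps = pserver_prog._parameters_on_pservers.get_distributed_vars_by_ep(
    "127.0.0.1:6174")
for v in vars_on_ps:
    print(v.slice.name, v.is_slice, v.block_id, v.offset)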